; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
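;
; As an illustrative (non-normative) example, a scalar C loop of roughly the
; following shape gives rise to such a 7-way interleaved load; the struct and
; variable names here are made up purely for exposition:
;
;   struct S { long f0, f1, f2, f3, f4, f5, f6; };  /* one group of 7 x i64 */
;   void deinterleave(struct S *in, long *o0, long *o1, long *o2, long *o3,
;                     long *o4, long *o5, long *o6, int n) {
;     for (int i = 0; i < n; ++i) {
;       o0[i] = in[i].f0;  /* field k of every element is gathered into  */
;       o1[i] = in[i].f1;  /* its own contiguous output stream, matching */
;       o2[i] = in[i].f2;  /* the shufflevector masks in the IR below    */
;       o3[i] = in[i].f3;
;       o4[i] = in[i].f4;
;       o5[i] = in[i].f5;
;       o6[i] = in[i].f6;
;     }
;   }
;
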
define void @load_i64_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
; SSE-LABEL: load_i64_stride7_vf2:
; SSE: # %bb.0:
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
; SSE-NEXT: movapd 96(%rdi), %xmm0
; SSE-NEXT: movapd 80(%rdi), %xmm1
; SSE-NEXT: movapd 64(%rdi), %xmm2
; SSE-NEXT: movapd (%rdi), %xmm3
; SSE-NEXT: movapd 16(%rdi), %xmm4
; SSE-NEXT: movapd 32(%rdi), %xmm5
; SSE-NEXT: movapd 48(%rdi), %xmm6
; SSE-NEXT: movapd %xmm6, %xmm7
; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm3[0],xmm7[1]
; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm2[0]
; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm4[0],xmm2[1]
; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm1[0]
; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm5[0],xmm1[1]
; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm0[0]
; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm6[0],xmm0[1]
; SSE-NEXT: movapd %xmm7, (%rsi)
; SSE-NEXT: movapd %xmm3, (%rdx)
; SSE-NEXT: movapd %xmm2, (%rcx)
; SSE-NEXT: movapd %xmm4, (%r8)
; SSE-NEXT: movapd %xmm1, (%r9)
; SSE-NEXT: movapd %xmm5, (%r10)
; SSE-NEXT: movapd %xmm0, (%rax)
; SSE-NEXT: retq
;
; AVX-LABEL: load_i64_stride7_vf2:
; AVX: # %bb.0:
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: vmovapd 16(%rdi), %xmm0
; AVX-NEXT: vmovaps 48(%rdi), %xmm1
; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX-NEXT: vmovapd 80(%rdi), %xmm3
; AVX-NEXT: vblendps {{.*#+}} xmm4 = mem[0,1],xmm1[2,3]
; AVX-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm0[0],mem[1]
; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
; AVX-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1]
; AVX-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX-NEXT: vmovaps %xmm4, (%rsi)
; AVX-NEXT: vmovdqa %xmm2, (%rdx)
; AVX-NEXT: vmovapd %xmm5, (%rcx)
; AVX-NEXT: vmovapd %xmm0, (%r8)
; AVX-NEXT: vmovapd %xmm3, (%r9)
; AVX-NEXT: vmovdqa %xmm6, (%r10)
; AVX-NEXT: vmovaps %xmm1, (%rax)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i64_stride7_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-NEXT: vmovaps 48(%rdi), %xmm1
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm1[2,3]
; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
; AVX2-NEXT: vmovaps 16(%rdi), %xmm4
; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
; AVX2-NEXT: vmovaps 32(%rdi), %xmm5
; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
; AVX2-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX2-NEXT: vmovaps %xmm2, (%rsi)
; AVX2-NEXT: vmovdqa %xmm3, (%rdx)
; AVX2-NEXT: vmovaps %xmm4, (%rcx)
; AVX2-NEXT: vextracti128 $1, %ymm0, (%r8)
; AVX2-NEXT: vmovaps %xmm5, (%r9)
; AVX2-NEXT: vmovdqa %xmm6, (%r10)
; AVX2-NEXT: vmovaps %xmm1, (%rax)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX2-FP-LABEL: load_i64_stride7_vf2:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm1
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm1[2,3]
; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vmovaps 16(%rdi), %xmm4
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm5
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX2-FP-NEXT: vmovaps %xmm2, (%rsi)
; AVX2-FP-NEXT: vmovdqa %xmm3, (%rdx)
; AVX2-FP-NEXT: vmovaps %xmm4, (%rcx)
; AVX2-FP-NEXT: vextracti128 $1, %ymm0, (%r8)
; AVX2-FP-NEXT: vmovaps %xmm5, (%r9)
; AVX2-FP-NEXT: vmovdqa %xmm6, (%r10)
; AVX2-FP-NEXT: vmovaps %xmm1, (%rax)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: load_i64_stride7_vf2:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vmovaps 16(%rdi), %xmm4
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm5
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsi)
; AVX2-FCP-NEXT: vmovdqa %xmm3, (%rdx)
; AVX2-FCP-NEXT: vmovaps %xmm4, (%rcx)
; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, (%r8)
; AVX2-FCP-NEXT: vmovaps %xmm5, (%r9)
; AVX2-FCP-NEXT: vmovdqa %xmm6, (%r10)
; AVX2-FCP-NEXT: vmovaps %xmm1, (%rax)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i64_stride7_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm6
; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm7
; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
; AVX512-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-NEXT: vmovdqa %xmm5, (%rcx)
; AVX512-NEXT: vmovdqa %xmm1, (%r8)
; AVX512-NEXT: vmovdqa %xmm6, (%r9)
; AVX512-NEXT: vmovdqa %xmm2, (%r10)
; AVX512-NEXT: vmovdqa %xmm3, (%rax)
; AVX512-NEXT: retq
;
; AVX512-FCP-LABEL: load_i64_stride7_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512-FCP-NEXT: vmovdqa %xmm5, (%rcx)
; AVX512-FCP-NEXT: vmovdqa %xmm1, (%r8)
; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512-FCP-NEXT: vmovdqa %xmm2, (%r10)
; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rax)
; AVX512-FCP-NEXT: retq
;
; AVX512DQ-LABEL: load_i64_stride7_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm6
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm7
; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
; AVX512DQ-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512DQ-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-NEXT: vmovdqa %xmm5, (%rcx)
; AVX512DQ-NEXT: vmovdqa %xmm1, (%r8)
; AVX512DQ-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-NEXT: vmovdqa %xmm2, (%r10)
; AVX512DQ-NEXT: vmovdqa %xmm3, (%rax)
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i64_stride7_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%r10)
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rax)
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i64_stride7_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm6
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm7
; AVX512BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
; AVX512BW-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-NEXT: vmovdqa %xmm5, (%rcx)
; AVX512BW-NEXT: vmovdqa %xmm1, (%r8)
; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
; AVX512BW-NEXT: vmovdqa %xmm2, (%r10)
; AVX512BW-NEXT: vmovdqa %xmm3, (%rax)
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i64_stride7_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
; AVX512BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
; AVX512BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa %xmm1, (%r8)
; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r10)
; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rax)
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i64_stride7_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-BW-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
; AVX512DQ-BW-NEXT: vmovdqa 80(%rdi), %xmm6
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
; AVX512DQ-BW-NEXT: vmovdqa 96(%rdi), %xmm7
; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa %xmm1, (%r8)
; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-NEXT: vmovdqa %xmm2, (%r10)
; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rax)
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm0[0,1],xmm3[2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %xmm5
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm5[2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 80(%rdi), %xmm6
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm2[0,1],xmm6[2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm7[2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, (%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r10)
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rax)
; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <14 x i64>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x i32> <i32 0, i32 7>
  %strided.vec1 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x i32> <i32 1, i32 8>
  %strided.vec2 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x i32> <i32 2, i32 9>
  %strided.vec3 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x i32> <i32 3, i32 10>
  %strided.vec4 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x i32> <i32 4, i32 11>
  %strided.vec5 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x i32> <i32 5, i32 12>
  %strided.vec6 = shufflevector <14 x i64> %wide.vec, <14 x i64> poison, <2 x i32> <i32 6, i32 13>
  store <2 x i64> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i64> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i64> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i64> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i64> %strided.vec4, ptr %out.vec4, align 64
  store <2 x i64> %strided.vec5, ptr %out.vec5, align 64
  store <2 x i64> %strided.vec6, ptr %out.vec6, align 64
  ret void
}

387 define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
388 ; SSE-LABEL: load_i64_stride7_vf4:
390 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
391 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
392 ; SSE-NEXT: movapd 208(%rdi), %xmm1
393 ; SSE-NEXT: movapd 96(%rdi), %xmm0
394 ; SSE-NEXT: movapd 144(%rdi), %xmm2
395 ; SSE-NEXT: movapd 192(%rdi), %xmm4
396 ; SSE-NEXT: movapd 80(%rdi), %xmm3
397 ; SSE-NEXT: movapd 128(%rdi), %xmm5
398 ; SSE-NEXT: movapd 176(%rdi), %xmm8
399 ; SSE-NEXT: movapd 64(%rdi), %xmm7
400 ; SSE-NEXT: movapd (%rdi), %xmm10
401 ; SSE-NEXT: movapd 16(%rdi), %xmm9
402 ; SSE-NEXT: movapd 32(%rdi), %xmm6
403 ; SSE-NEXT: movapd 48(%rdi), %xmm11
404 ; SSE-NEXT: movapd 112(%rdi), %xmm12
405 ; SSE-NEXT: movapd 160(%rdi), %xmm13
406 ; SSE-NEXT: movapd %xmm13, %xmm14
407 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm12[0],xmm14[1]
408 ; SSE-NEXT: movapd %xmm11, %xmm15
409 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm10[0],xmm15[1]
410 ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm7[0]
411 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm8[0]
412 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1]
413 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm5[0],xmm8[1]
414 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm3[0]
415 ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm4[0]
416 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm6[0],xmm3[1]
417 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
418 ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm0[0]
419 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
420 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm11[0],xmm0[1]
421 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1]
422 ; SSE-NEXT: movapd %xmm14, 16(%rsi)
423 ; SSE-NEXT: movapd %xmm15, (%rsi)
424 ; SSE-NEXT: movapd %xmm12, 16(%rdx)
425 ; SSE-NEXT: movapd %xmm10, (%rdx)
426 ; SSE-NEXT: movapd %xmm8, 16(%rcx)
427 ; SSE-NEXT: movapd %xmm7, (%rcx)
428 ; SSE-NEXT: movapd %xmm5, 16(%r8)
429 ; SSE-NEXT: movapd %xmm9, (%r8)
430 ; SSE-NEXT: movapd %xmm4, 16(%r9)
431 ; SSE-NEXT: movapd %xmm3, (%r9)
432 ; SSE-NEXT: movapd %xmm2, 16(%r10)
433 ; SSE-NEXT: movapd %xmm6, (%r10)
434 ; SSE-NEXT: movapd %xmm1, 16(%rax)
435 ; SSE-NEXT: movapd %xmm0, (%rax)
438 ; AVX-LABEL: load_i64_stride7_vf4:
440 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
441 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
442 ; AVX-NEXT: vmovapd 192(%rdi), %ymm0
443 ; AVX-NEXT: vmovapd 128(%rdi), %ymm2
444 ; AVX-NEXT: vmovapd 160(%rdi), %ymm4
445 ; AVX-NEXT: vmovapd 96(%rdi), %ymm5
446 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1
447 ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm5[0,1,2],ymm1[3]
448 ; AVX-NEXT: vmovapd 16(%rdi), %xmm7
449 ; AVX-NEXT: vmovapd 48(%rdi), %xmm3
450 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm8
451 ; AVX-NEXT: vmovapd 80(%rdi), %xmm9
452 ; AVX-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm3[1]
453 ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm10[0,1],ymm6[2,3]
454 ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm4[1],ymm5[3],ymm4[2]
455 ; AVX-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
456 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm8[0,1],ymm5[2,3]
457 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm8
458 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm10
459 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1,2],ymm4[3]
460 ; AVX-NEXT: vblendpd {{.*#+}} xmm10 = xmm7[0],mem[1]
461 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm10[0,1],ymm4[2,3]
462 ; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[1],ymm9[0],ymm7[2],ymm9[3]
463 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm10
464 ; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
465 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
466 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3]
467 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm8
468 ; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm2[0,1,2],ymm8[3]
469 ; AVX-NEXT: vblendpd {{.*#+}} xmm9 = mem[0],xmm9[1]
470 ; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3]
471 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[3],ymm0[2]
472 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm9
473 ; AVX-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
474 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm9[0,1],ymm2[2,3]
475 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3]
476 ; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm3[0],mem[1]
477 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
478 ; AVX-NEXT: vmovapd %ymm6, (%rsi)
479 ; AVX-NEXT: vmovapd %ymm5, (%rdx)
480 ; AVX-NEXT: vmovapd %ymm4, (%rcx)
481 ; AVX-NEXT: vmovapd %ymm7, (%r8)
482 ; AVX-NEXT: vmovapd %ymm8, (%r9)
483 ; AVX-NEXT: vmovapd %ymm2, (%r10)
484 ; AVX-NEXT: vmovapd %ymm0, (%rax)
485 ; AVX-NEXT: vzeroupper
488 ; AVX2-LABEL: load_i64_stride7_vf4:
490 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
491 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
492 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0
493 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2
494 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm5
495 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm6
496 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm7
497 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm1
498 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
499 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm3[6,7]
500 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
501 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = mem[0,1],xmm3[2,3]
502 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
503 ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
504 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8
505 ; AVX2-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
506 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
507 ; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm8
508 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
509 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm8
510 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3]
511 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
512 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
513 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm8
514 ; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
515 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
516 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm9[2,3]
517 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
518 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm8[6,7]
519 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm9
520 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3]
521 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
522 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
523 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm9
524 ; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
525 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
526 ; AVX2-NEXT: vpbroadcastq %xmm1, %ymm1
527 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
528 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],mem[2,3]
529 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
530 ; AVX2-NEXT: vmovdqa %ymm4, (%rsi)
531 ; AVX2-NEXT: vmovdqa %ymm7, (%rdx)
532 ; AVX2-NEXT: vmovdqa %ymm6, (%rcx)
533 ; AVX2-NEXT: vmovdqa %ymm5, (%r8)
534 ; AVX2-NEXT: vmovdqa %ymm8, (%r9)
535 ; AVX2-NEXT: vmovdqa %ymm2, (%r10)
536 ; AVX2-NEXT: vmovdqa %ymm0, (%rax)
537 ; AVX2-NEXT: vzeroupper
540 ; AVX2-FP-LABEL: load_i64_stride7_vf4:
542 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
543 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
544 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0
545 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2
546 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm5
547 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm6
548 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm7
549 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm1
550 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
551 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm3[6,7]
552 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3
553 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = mem[0,1],xmm3[2,3]
554 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
555 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
556 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm8
557 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
558 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
559 ; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm8
560 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
561 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm8
562 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3]
563 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
564 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
565 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm8
566 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
567 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
568 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm9[2,3]
569 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
570 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm8[6,7]
571 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm9
572 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3]
573 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
574 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
575 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm9
576 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
577 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
578 ; AVX2-FP-NEXT: vpbroadcastq %xmm1, %ymm1
579 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
580 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],mem[2,3]
581 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
582 ; AVX2-FP-NEXT: vmovdqa %ymm4, (%rsi)
583 ; AVX2-FP-NEXT: vmovdqa %ymm7, (%rdx)
584 ; AVX2-FP-NEXT: vmovdqa %ymm6, (%rcx)
585 ; AVX2-FP-NEXT: vmovdqa %ymm5, (%r8)
586 ; AVX2-FP-NEXT: vmovdqa %ymm8, (%r9)
587 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%r10)
588 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax)
589 ; AVX2-FP-NEXT: vzeroupper
592 ; AVX2-FCP-LABEL: load_i64_stride7_vf4:
594 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
595 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
596 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0
597 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
598 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
599 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm6
600 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
601 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm1
602 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
603 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm3[6,7]
604 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
605 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = mem[0,1],xmm3[2,3]
606 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
607 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
608 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm8
609 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
610 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
611 ; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm8
612 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm6 = ymm8[1],ymm6[1],ymm8[3],ymm6[3]
613 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
614 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3]
615 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
616 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
617 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm8
618 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
619 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
620 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm5[2,3],ymm9[2,3]
621 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
622 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5],ymm8[6,7]
623 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm9
624 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3]
625 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
626 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
627 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm9
628 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
629 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
630 ; AVX2-FCP-NEXT: vpbroadcastq %xmm1, %ymm1
631 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
632 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],mem[2,3]
633 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
634 ; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rsi)
635 ; AVX2-FCP-NEXT: vmovdqa %ymm7, (%rdx)
636 ; AVX2-FCP-NEXT: vmovdqa %ymm6, (%rcx)
637 ; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r8)
638 ; AVX2-FCP-NEXT: vmovdqa %ymm8, (%r9)
639 ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%r10)
640 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
641 ; AVX2-FCP-NEXT: vzeroupper
642 ; AVX2-FCP-NEXT: retq
644 ; AVX512-LABEL: load_i64_stride7_vf4:
646 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
647 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
648 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm3
649 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm4
650 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm5
651 ; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
652 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
653 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
654 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
655 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
656 ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
657 ; AVX512-NEXT: vpbroadcastq 176(%rdi), %ymm2
658 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
659 ; AVX512-NEXT: vmovdqa 128(%rdi), %xmm6
660 ; AVX512-NEXT: vpbroadcastq %xmm6, %ymm2
661 ; AVX512-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
662 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm7
663 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
664 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
665 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm7
666 ; AVX512-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
667 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm8
668 ; AVX512-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
669 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
670 ; AVX512-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
671 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7
672 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm8
673 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
674 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
675 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
676 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
677 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm9
678 ; AVX512-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
679 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
680 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
681 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
682 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
683 ; AVX512-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
684 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
685 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
686 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
687 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
688 ; AVX512-NEXT: vmovdqa %ymm1, (%rdx)
689 ; AVX512-NEXT: vmovdqa %ymm2, (%rcx)
690 ; AVX512-NEXT: vmovdqa %ymm6, (%r8)
691 ; AVX512-NEXT: vmovdqa %ymm7, (%r9)
692 ; AVX512-NEXT: vmovdqa %ymm8, (%r10)
693 ; AVX512-NEXT: vmovdqa %ymm3, (%rax)
694 ; AVX512-NEXT: vzeroupper
697 ; AVX512-FCP-LABEL: load_i64_stride7_vf4:
698 ; AVX512-FCP: # %bb.0:
699 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
700 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
701 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
702 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
703 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
704 ; AVX512-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
705 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
706 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
707 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
708 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
709 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
710 ; AVX512-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
711 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
712 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
713 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
714 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
715 ; AVX512-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
716 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
717 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
718 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
719 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
720 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
721 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
722 ; AVX512-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
723 ; AVX512-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
724 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
725 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
726 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
727 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
728 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
729 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
730 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
731 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
732 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
733 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
734 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
735 ; AVX512-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
736 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
737 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
738 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
739 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rsi)
740 ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx)
741 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%rcx)
742 ; AVX512-FCP-NEXT: vmovdqa %ymm6, (%r8)
743 ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r9)
744 ; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r10)
745 ; AVX512-FCP-NEXT: vmovdqa %ymm3, (%rax)
746 ; AVX512-FCP-NEXT: vzeroupper
747 ; AVX512-FCP-NEXT: retq
749 ; AVX512DQ-LABEL: load_i64_stride7_vf4:
751 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
752 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
753 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm3
754 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm4
755 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm5
756 ; AVX512DQ-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
757 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
758 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
759 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
760 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
761 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
762 ; AVX512DQ-NEXT: vpbroadcastq 176(%rdi), %ymm2
763 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
764 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %xmm6
765 ; AVX512DQ-NEXT: vpbroadcastq %xmm6, %ymm2
766 ; AVX512DQ-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
767 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm7
768 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
769 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
770 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm7
771 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
772 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm8
773 ; AVX512DQ-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
774 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
775 ; AVX512DQ-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
776 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7
777 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm8
778 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
779 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
780 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
781 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
782 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm9
783 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
784 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
785 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
786 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
787 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
788 ; AVX512DQ-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
789 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
790 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
791 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
792 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi)
793 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%rdx)
794 ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rcx)
795 ; AVX512DQ-NEXT: vmovdqa %ymm6, (%r8)
796 ; AVX512DQ-NEXT: vmovdqa %ymm7, (%r9)
797 ; AVX512DQ-NEXT: vmovdqa %ymm8, (%r10)
798 ; AVX512DQ-NEXT: vmovdqa %ymm3, (%rax)
799 ; AVX512DQ-NEXT: vzeroupper
800 ; AVX512DQ-NEXT: retq
802 ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf4:
803 ; AVX512DQ-FCP: # %bb.0:
804 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
805 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
806 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
807 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
808 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
809 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
810 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
811 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
812 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
813 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
814 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
815 ; AVX512DQ-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
816 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
817 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
818 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
819 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
820 ; AVX512DQ-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
821 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
822 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
823 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
824 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
825 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
826 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
827 ; AVX512DQ-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
828 ; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
829 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
830 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
831 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
832 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
833 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
834 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
835 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
836 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
837 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
838 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
839 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
840 ; AVX512DQ-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
841 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
842 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
843 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
844 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rsi)
845 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx)
846 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%rcx)
847 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%r8)
848 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r9)
849 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r10)
850 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm3, (%rax)
851 ; AVX512DQ-FCP-NEXT: vzeroupper
852 ; AVX512DQ-FCP-NEXT: retq
854 ; AVX512BW-LABEL: load_i64_stride7_vf4:
856 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
857 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
858 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm3
859 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm4
860 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5
861 ; AVX512BW-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
862 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
863 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
864 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
865 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
866 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
867 ; AVX512BW-NEXT: vpbroadcastq 176(%rdi), %ymm2
868 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
869 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %xmm6
870 ; AVX512BW-NEXT: vpbroadcastq %xmm6, %ymm2
871 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
872 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm7
873 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
874 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
875 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm7
876 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
877 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %xmm8
878 ; AVX512BW-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
879 ; AVX512BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
880 ; AVX512BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
881 ; AVX512BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7
882 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm8
883 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
884 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
885 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
886 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
887 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm9
888 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
889 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
890 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
891 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
892 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
893 ; AVX512BW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
894 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
895 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
896 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
897 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
898 ; AVX512BW-NEXT: vmovdqa %ymm1, (%rdx)
899 ; AVX512BW-NEXT: vmovdqa %ymm2, (%rcx)
900 ; AVX512BW-NEXT: vmovdqa %ymm6, (%r8)
901 ; AVX512BW-NEXT: vmovdqa %ymm7, (%r9)
902 ; AVX512BW-NEXT: vmovdqa %ymm8, (%r10)
903 ; AVX512BW-NEXT: vmovdqa %ymm3, (%rax)
904 ; AVX512BW-NEXT: vzeroupper
905 ; AVX512BW-NEXT: retq
907 ; AVX512BW-FCP-LABEL: load_i64_stride7_vf4:
908 ; AVX512BW-FCP: # %bb.0:
909 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
910 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
911 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
912 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
913 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
914 ; AVX512BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
915 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
916 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
917 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
918 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
919 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
920 ; AVX512BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
921 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
922 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
923 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
924 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
925 ; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
926 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
927 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
928 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
929 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
930 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
931 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
932 ; AVX512BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
933 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
934 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
935 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
936 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
937 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
938 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
939 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
940 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
941 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
942 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
943 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
944 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
945 ; AVX512BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
946 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
947 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
948 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
949 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
950 ; AVX512BW-FCP-NEXT: vmovdqa %ymm1, (%rdx)
951 ; AVX512BW-FCP-NEXT: vmovdqa %ymm2, (%rcx)
952 ; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%r8)
953 ; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r9)
954 ; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r10)
955 ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rax)
956 ; AVX512BW-FCP-NEXT: vzeroupper
957 ; AVX512BW-FCP-NEXT: retq
959 ; AVX512DQ-BW-LABEL: load_i64_stride7_vf4:
960 ; AVX512DQ-BW: # %bb.0:
961 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
962 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
963 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm3
964 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm4
965 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm5
966 ; AVX512DQ-BW-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
967 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
968 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
969 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
970 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
971 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
972 ; AVX512DQ-BW-NEXT: vpbroadcastq 176(%rdi), %ymm2
973 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
974 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %xmm6
975 ; AVX512DQ-BW-NEXT: vpbroadcastq %xmm6, %ymm2
976 ; AVX512DQ-BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3]
977 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm7
978 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
979 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm7[0,1,2,3],ymm2[4,5,6,7]
980 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm7
981 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
982 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %xmm8
983 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} xmm6 = xmm6[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
984 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
985 ; AVX512DQ-BW-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm7[2,3],ymm6[2,3]
986 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm7
987 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm8
988 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
989 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
990 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
991 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
992 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm9
993 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
994 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
995 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
996 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
997 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
998 ; AVX512DQ-BW-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
999 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
1000 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
1001 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
1002 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi)
1003 ; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rdx)
1004 ; AVX512DQ-BW-NEXT: vmovdqa %ymm2, (%rcx)
1005 ; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%r8)
1006 ; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r9)
1007 ; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r10)
1008 ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rax)
1009 ; AVX512DQ-BW-NEXT: vzeroupper
1010 ; AVX512DQ-BW-NEXT: retq
1012 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf4:
1013 ; AVX512DQ-BW-FCP: # %bb.0:
1014 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1015 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1016 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm3
1017 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm4
1018 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm5
1019 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
1020 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [0,7,14,0]
1021 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
1022 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
1023 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm1 = [9,0,7,0]
1024 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
1025 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq 176(%rdi), %ymm2
1026 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
1027 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
1028 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
1029 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm6 = [0,0,0,7]
1030 ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm6
1031 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
1032 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
1033 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
1034 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm7
1035 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} xmm8 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
1036 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1037 ; AVX512DQ-BW-FCP-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm6[2,3],ymm8[2,3]
1038 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1039 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm8
1040 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
1041 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
1042 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
1043 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
1044 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
1045 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1046 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [5,12]
1047 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm9
1048 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1049 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm9 = [0,0,4,11]
1050 ; AVX512DQ-BW-FCP-NEXT: vpermi2q 192(%rdi), %zmm3, %zmm9
1051 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [6,13]
1052 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm3
1053 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm9[4,5,6,7]
1054 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1055 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm1, (%rdx)
1056 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm2, (%rcx)
1057 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%r8)
1058 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r9)
1059 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r10)
1060 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rax)
1061 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1062 ; AVX512DQ-BW-FCP-NEXT: retq
1063 %wide.vec = load <28 x i64>, ptr %in.vec, align 64
1064 %strided.vec0 = shufflevector <28 x i64> %wide.vec, <28 x i64> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
1065 %strided.vec1 = shufflevector <28 x i64> %wide.vec, <28 x i64> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
1066 %strided.vec2 = shufflevector <28 x i64> %wide.vec, <28 x i64> poison, <4 x i32> <i32 2, i32 9, i32 16, i32 23>
1067 %strided.vec3 = shufflevector <28 x i64> %wide.vec, <28 x i64> poison, <4 x i32> <i32 3, i32 10, i32 17, i32 24>
1068 %strided.vec4 = shufflevector <28 x i64> %wide.vec, <28 x i64> poison, <4 x i32> <i32 4, i32 11, i32 18, i32 25>
1069 %strided.vec5 = shufflevector <28 x i64> %wide.vec, <28 x i64> poison, <4 x i32> <i32 5, i32 12, i32 19, i32 26>
1070 %strided.vec6 = shufflevector <28 x i64> %wide.vec, <28 x i64> poison, <4 x i32> <i32 6, i32 13, i32 20, i32 27>
1071 store <4 x i64> %strided.vec0, ptr %out.vec0, align 64
1072 store <4 x i64> %strided.vec1, ptr %out.vec1, align 64
1073 store <4 x i64> %strided.vec2, ptr %out.vec2, align 64
1074 store <4 x i64> %strided.vec3, ptr %out.vec3, align 64
1075 store <4 x i64> %strided.vec4, ptr %out.vec4, align 64
1076 store <4 x i64> %strided.vec5, ptr %out.vec5, align 64
1077 store <4 x i64> %strided.vec6, ptr %out.vec6, align 64
1078 ret void
1079 }
1081 define void @load_i64_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
1082 ; SSE-LABEL: load_i64_stride7_vf8:
1083 ; SSE: # %bb.0:
1084 ; SSE-NEXT: subq $88, %rsp
1085 ; SSE-NEXT: movapd 320(%rdi), %xmm1
1086 ; SSE-NEXT: movapd 208(%rdi), %xmm0
1087 ; SSE-NEXT: movapd 256(%rdi), %xmm3
1088 ; SSE-NEXT: movapd 144(%rdi), %xmm2
1089 ; SSE-NEXT: movapd 304(%rdi), %xmm5
1090 ; SSE-NEXT: movapd 192(%rdi), %xmm4
1091 ; SSE-NEXT: movapd 240(%rdi), %xmm7
1092 ; SSE-NEXT: movapd 128(%rdi), %xmm6
1093 ; SSE-NEXT: movapd 288(%rdi), %xmm9
1094 ; SSE-NEXT: movapd 176(%rdi), %xmm8
1095 ; SSE-NEXT: movapd 336(%rdi), %xmm10
1096 ; SSE-NEXT: movapd 224(%rdi), %xmm11
1097 ; SSE-NEXT: movapd 272(%rdi), %xmm14
1098 ; SSE-NEXT: movapd 112(%rdi), %xmm13
1099 ; SSE-NEXT: movapd 160(%rdi), %xmm15
1100 ; SSE-NEXT: movapd %xmm15, %xmm12
1101 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm13[0],xmm12[1]
1102 ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1103 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm8[0]
1104 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1]
1105 ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1106 ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm4[0]
1107 ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1108 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm2[0],xmm4[1]
1109 ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1110 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0]
1111 ; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill
1112 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1]
1113 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1114 ; SSE-NEXT: movapd %xmm14, %xmm12
1115 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm11[0],xmm12[1]
1116 ; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm9[0]
1117 ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1118 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm7[0],xmm9[1]
1119 ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1120 ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm5[0]
1121 ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1122 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm3[0],xmm5[1]
1123 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1124 ; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm1[0]
1125 ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1126 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1]
1127 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1128 ; SSE-NEXT: movapd 384(%rdi), %xmm2
1129 ; SSE-NEXT: movapd %xmm2, %xmm11
1130 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm10[0],xmm11[1]
1131 ; SSE-NEXT: movapd 400(%rdi), %xmm7
1132 ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm7[0]
1133 ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1134 ; SSE-NEXT: movapd 352(%rdi), %xmm8
1135 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm8[0],xmm7[1]
1136 ; SSE-NEXT: movapd 416(%rdi), %xmm10
1137 ; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm10[0]
1138 ; SSE-NEXT: movapd 368(%rdi), %xmm15
1139 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1]
1140 ; SSE-NEXT: movapd 432(%rdi), %xmm14
1141 ; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm14[0]
1142 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1]
1143 ; SSE-NEXT: movapd (%rdi), %xmm2
1144 ; SSE-NEXT: movapd 48(%rdi), %xmm9
1145 ; SSE-NEXT: movapd %xmm9, %xmm3
1146 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
1147 ; SSE-NEXT: movapd 64(%rdi), %xmm0
1148 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm0[0]
1149 ; SSE-NEXT: movapd 16(%rdi), %xmm1
1150 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1151 ; SSE-NEXT: movapd 80(%rdi), %xmm4
1152 ; SSE-NEXT: shufpd {{.*#+}} xmm1 = xmm1[1],xmm4[0]
1153 ; SSE-NEXT: movapd 32(%rdi), %xmm5
1154 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm5[0],xmm4[1]
1155 ; SSE-NEXT: movapd 96(%rdi), %xmm6
1156 ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm6[0]
1157 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm9[0],xmm6[1]
1158 ; SSE-NEXT: movapd %xmm3, (%rsi)
1159 ; SSE-NEXT: movapd %xmm11, 48(%rsi)
1160 ; SSE-NEXT: movapd %xmm12, 32(%rsi)
1161 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1162 ; SSE-NEXT: movaps %xmm3, 16(%rsi)
1163 ; SSE-NEXT: movapd %xmm2, (%rdx)
1164 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1165 ; SSE-NEXT: movaps %xmm2, 48(%rdx)
1166 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1167 ; SSE-NEXT: movaps %xmm2, 32(%rdx)
1168 ; SSE-NEXT: movapd %xmm13, 16(%rdx)
1169 ; SSE-NEXT: movapd %xmm0, (%rcx)
1170 ; SSE-NEXT: movapd %xmm7, 48(%rcx)
1171 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1172 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
1173 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1174 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
1175 ; SSE-NEXT: movapd %xmm1, (%r8)
1176 ; SSE-NEXT: movapd %xmm8, 48(%r8)
1177 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1178 ; SSE-NEXT: movaps %xmm0, 32(%r8)
1179 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1180 ; SSE-NEXT: movaps %xmm0, 16(%r8)
1181 ; SSE-NEXT: movapd %xmm4, (%r9)
1182 ; SSE-NEXT: movapd %xmm10, 48(%r9)
1183 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1184 ; SSE-NEXT: movaps %xmm0, 32(%r9)
1185 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1186 ; SSE-NEXT: movaps %xmm0, 16(%r9)
1187 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1188 ; SSE-NEXT: movapd %xmm5, (%rax)
1189 ; SSE-NEXT: movapd %xmm15, 48(%rax)
1190 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1191 ; SSE-NEXT: movaps %xmm0, 32(%rax)
1192 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1193 ; SSE-NEXT: movaps %xmm0, 16(%rax)
1194 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1195 ; SSE-NEXT: movapd %xmm6, (%rax)
1196 ; SSE-NEXT: movapd %xmm14, 48(%rax)
1197 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1198 ; SSE-NEXT: movaps %xmm0, 32(%rax)
1199 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1200 ; SSE-NEXT: movaps %xmm0, 16(%rax)
1201 ; SSE-NEXT: addq $88, %rsp
1202 ; SSE-NEXT: retq
1204 ; AVX-LABEL: load_i64_stride7_vf8:
1205 ; AVX: # %bb.0:
1206 ; AVX-NEXT: vmovapd 384(%rdi), %ymm9
1207 ; AVX-NEXT: vmovapd 160(%rdi), %ymm8
1208 ; AVX-NEXT: vmovapd 320(%rdi), %ymm7
1209 ; AVX-NEXT: vmovapd 96(%rdi), %ymm6
1210 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
1211 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1212 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3]
1213 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm11
1214 ; AVX-NEXT: vmovapd 48(%rdi), %xmm1
1215 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1216 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm12
1217 ; AVX-NEXT: vmovapd 80(%rdi), %xmm10
1218 ; AVX-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm1[1]
1219 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
1220 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1221 ; AVX-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm3
1222 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm3[3]
1223 ; AVX-NEXT: vmovapd 224(%rdi), %xmm13
1224 ; AVX-NEXT: vmovapd 272(%rdi), %xmm4
1225 ; AVX-NEXT: vblendpd {{.*#+}} xmm14 = xmm13[0],xmm4[1]
1226 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm5[2,3]
1227 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1228 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[0],ymm8[1],ymm6[3],ymm8[2]
1229 ; AVX-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
1230 ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm14[0,1],ymm6[2,3]
1231 ; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm9[1],ymm7[3],ymm9[2]
1232 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm14
1233 ; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
1234 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm13[0,1],ymm7[2,3]
1235 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm13
1236 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm14
1237 ; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm14[0,1,2],ymm8[3]
1238 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm11[0,1,2,3],xmm12[4,5,6,7]
1239 ; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,3]
1240 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm12
1241 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm14
1242 ; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm14[0,1,2],ymm9[3]
1243 ; AVX-NEXT: vmovapd 240(%rdi), %xmm14
1244 ; AVX-NEXT: vblendpd {{.*#+}} xmm15 = xmm14[0],mem[1]
1245 ; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm15[0,1],ymm9[2,3]
1246 ; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm11[1],ymm10[0],ymm11[2],ymm10[3]
1247 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm15
1248 ; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
1249 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
1250 ; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm13[2,3]
1251 ; AVX-NEXT: vmovapd 304(%rdi), %xmm13
1252 ; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[1],ymm13[0],ymm14[2],ymm13[3]
1253 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm0
1254 ; AVX-NEXT: vpalignr {{.*#+}} xmm12 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1255 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
1256 ; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm14[0,1],ymm12[2,3]
1257 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm14
1258 ; AVX-NEXT: vmovapd 128(%rdi), %ymm15
1259 ; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3]
1260 ; AVX-NEXT: vblendpd {{.*#+}} xmm10 = mem[0],xmm10[1]
1261 ; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm14[2,3]
1262 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1263 ; AVX-NEXT: vmovapd 352(%rdi), %ymm14
1264 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3]
1265 ; AVX-NEXT: vmovapd 256(%rdi), %xmm1
1266 ; AVX-NEXT: vblendpd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
1267 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm13[0,1],ymm0[2,3]
1268 ; AVX-NEXT: vmovapd 192(%rdi), %ymm13
1269 ; AVX-NEXT: vshufpd {{.*#+}} ymm15 = ymm15[0],ymm13[1],ymm15[3],ymm13[2]
1270 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm5
1271 ; AVX-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
1272 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm15[2,3]
1273 ; AVX-NEXT: vmovapd 416(%rdi), %ymm15
1274 ; AVX-NEXT: vshufpd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[3],ymm15[2]
1275 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm0
1276 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1277 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm14[2,3]
1278 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
1279 ; AVX-NEXT: # ymm13 = mem[0,1,2],ymm13[3]
1280 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
1281 ; AVX-NEXT: vblendpd {{.*#+}} xmm14 = xmm14[0],mem[1]
1282 ; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm14[0,1],ymm13[2,3]
1283 ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1,2],ymm15[3]
1284 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm4[0],xmm0[1]
1285 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3]
1286 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1287 ; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
1288 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1289 ; AVX-NEXT: vmovaps %ymm3, (%rsi)
1290 ; AVX-NEXT: vmovapd %ymm7, 32(%rdx)
1291 ; AVX-NEXT: vmovapd %ymm6, (%rdx)
1292 ; AVX-NEXT: vmovapd %ymm9, 32(%rcx)
1293 ; AVX-NEXT: vmovapd %ymm8, (%rcx)
1294 ; AVX-NEXT: vmovapd %ymm12, 32(%r8)
1295 ; AVX-NEXT: vmovapd %ymm11, (%r8)
1296 ; AVX-NEXT: vmovapd %ymm2, 32(%r9)
1297 ; AVX-NEXT: vmovapd %ymm10, (%r9)
1298 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1299 ; AVX-NEXT: vmovapd %ymm1, 32(%rax)
1300 ; AVX-NEXT: vmovapd %ymm5, (%rax)
1301 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1302 ; AVX-NEXT: vmovapd %ymm0, 32(%rax)
1303 ; AVX-NEXT: vmovapd %ymm13, (%rax)
1304 ; AVX-NEXT: vzeroupper
1305 ; AVX-NEXT: retq
1307 ; AVX2-LABEL: load_i64_stride7_vf8:
1308 ; AVX2: # %bb.0:
1309 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm4
1310 ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm12
1311 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm11
1312 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm10
1313 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm9
1314 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm8
1315 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm7
1316 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm0
1317 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1318 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1319 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
1320 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2
1321 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3]
1322 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
1323 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1324 ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm3
1325 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5
1326 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm5[6,7]
1327 ; AVX2-NEXT: vmovdqa 224(%rdi), %xmm13
1328 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm5
1329 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm13[0,1],xmm5[2,3]
1330 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7]
1331 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1332 ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1333 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm14
1334 ; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
1335 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7]
1336 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1337 ; AVX2-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
1338 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm15
1339 ; AVX2-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
1340 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
1341 ; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm13
1342 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm13[1],ymm9[1],ymm13[3],ymm9[3]
1343 ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm14[2,3]
1344 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
1345 ; AVX2-NEXT: vpbroadcastq 352(%rdi), %ymm13
1346 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3]
1347 ; AVX2-NEXT: vmovdqa 240(%rdi), %xmm13
1348 ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
1349 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
1350 ; AVX2-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
1351 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm13
1352 ; AVX2-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
1353 ; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
1354 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
1355 ; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
1356 ; AVX2-NEXT: vmovdqa 416(%rdi), %xmm14
1357 ; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
1358 ; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
1359 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3]
1360 ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
1361 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7]
1362 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm15
1363 ; AVX2-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
1364 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7]
1365 ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm15
1366 ; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
1367 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
1368 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm1
1369 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],mem[2,3]
1370 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7]
1371 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm14
1372 ; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
1373 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7
1374 ; AVX2-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
1375 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
1376 ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm7
1377 ; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
1378 ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm0
1379 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1380 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
1381 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
1382 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
1383 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
1384 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
1385 ; AVX2-NEXT: vpbroadcastq %xmm3, %ymm3
1386 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3]
1387 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
1388 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1389 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1390 ; AVX2-NEXT: vmovaps %ymm3, 32(%rsi)
1391 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1392 ; AVX2-NEXT: vmovaps %ymm3, (%rsi)
1393 ; AVX2-NEXT: vmovdqa %ymm8, 32(%rdx)
1394 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1395 ; AVX2-NEXT: vmovaps %ymm3, (%rdx)
1396 ; AVX2-NEXT: vmovdqa %ymm10, 32(%rcx)
1397 ; AVX2-NEXT: vmovdqa %ymm9, (%rcx)
1398 ; AVX2-NEXT: vmovdqa %ymm12, 32(%r8)
1399 ; AVX2-NEXT: vmovdqa %ymm11, (%r8)
1400 ; AVX2-NEXT: vmovdqa %ymm6, 32(%r9)
1401 ; AVX2-NEXT: vmovdqa %ymm13, (%r9)
1402 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1403 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rax)
1404 ; AVX2-NEXT: vmovdqa %ymm4, (%rax)
1405 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1406 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rax)
1407 ; AVX2-NEXT: vmovdqa %ymm2, (%rax)
1408 ; AVX2-NEXT: vzeroupper
1409 ; AVX2-NEXT: retq
1411 ; AVX2-FP-LABEL: load_i64_stride7_vf8:
1412 ; AVX2-FP: # %bb.0:
1413 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm4
1414 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm12
1415 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm11
1416 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm10
1417 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm9
1418 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm8
1419 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm7
1420 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm0
1421 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1422 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1423 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
1424 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm2
1425 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3]
1426 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
1427 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1428 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm3
1429 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5
1430 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm5[6,7]
1431 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm13
1432 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm5
1433 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm13[0,1],xmm5[2,3]
1434 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7]
1435 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1436 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1437 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm14
1438 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
1439 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7]
1440 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1441 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
1442 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm15
1443 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
1444 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
1445 ; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm13
1446 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm13[1],ymm9[1],ymm13[3],ymm9[3]
1447 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm14[2,3]
1448 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
1449 ; AVX2-FP-NEXT: vpbroadcastq 352(%rdi), %ymm13
1450 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3]
1451 ; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm13
1452 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
1453 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
1454 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
1455 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm13
1456 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
1457 ; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
1458 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
1459 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
1460 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm14
1461 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
1462 ; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
1463 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3]
1464 ; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
1465 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7]
1466 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm15
1467 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
1468 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7]
1469 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm15
1470 ; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
1471 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
1472 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm1
1473 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],mem[2,3]
1474 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7]
1475 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm14
1476 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
1477 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm7
1478 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
1479 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
1480 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm7
1481 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
1482 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm0
1483 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1484 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
1485 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
1486 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
1487 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
1488 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
1489 ; AVX2-FP-NEXT: vpbroadcastq %xmm3, %ymm3
1490 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3]
1491 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
1492 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1493 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1494 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi)
1495 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1496 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi)
1497 ; AVX2-FP-NEXT: vmovdqa %ymm8, 32(%rdx)
1498 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1499 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx)
1500 ; AVX2-FP-NEXT: vmovdqa %ymm10, 32(%rcx)
1501 ; AVX2-FP-NEXT: vmovdqa %ymm9, (%rcx)
1502 ; AVX2-FP-NEXT: vmovdqa %ymm12, 32(%r8)
1503 ; AVX2-FP-NEXT: vmovdqa %ymm11, (%r8)
1504 ; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9)
1505 ; AVX2-FP-NEXT: vmovdqa %ymm13, (%r9)
1506 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1507 ; AVX2-FP-NEXT: vmovdqa %ymm1, 32(%rax)
1508 ; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax)
1509 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1510 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax)
1511 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rax)
1512 ; AVX2-FP-NEXT: vzeroupper
1513 ; AVX2-FP-NEXT: retq
1515 ; AVX2-FCP-LABEL: load_i64_stride7_vf8:
1516 ; AVX2-FCP: # %bb.0:
1517 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm4
1518 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm12
1519 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
1520 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm10
1521 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm9
1522 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm8
1523 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
1524 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm0
1525 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1526 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1527 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
1528 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm2
1529 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = mem[0,1],xmm2[2,3]
1530 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
1531 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1532 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm3
1533 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm5
1534 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm5[6,7]
1535 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm13
1536 ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm5
1537 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm13[0,1],xmm5[2,3]
1538 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm6[4,5,6,7]
1539 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1540 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1541 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm14
1542 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
1543 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm7[4,5,6,7]
1544 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1545 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm8 = ymm8[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
1546 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm15
1547 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
1548 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
1549 ; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm13
1550 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm13[1],ymm9[1],ymm13[3],ymm9[3]
1551 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = mem[0,1],xmm14[2,3]
1552 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
1553 ; AVX2-FCP-NEXT: vpbroadcastq 352(%rdi), %ymm13
1554 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm10 = ymm13[1],ymm10[1],ymm13[3],ymm10[3]
1555 ; AVX2-FCP-NEXT: vmovdqa 240(%rdi), %xmm13
1556 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
1557 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3],ymm10[4,5,6,7]
1558 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
1559 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm13
1560 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm14 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
1561 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
1562 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm11 = ymm11[2,3],ymm14[2,3]
1563 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
1564 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm14
1565 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
1566 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
1567 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm12 = ymm12[2,3],ymm15[2,3]
1568 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
1569 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3,4,5],ymm13[6,7]
1570 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm15
1571 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
1572 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3],ymm13[4,5,6,7]
1573 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm15
1574 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
1575 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4,5],ymm14[6,7]
1576 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm1
1577 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],mem[2,3]
1578 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm14[4,5,6,7]
1579 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
1580 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm4 = ymm4[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
1581 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm7
1582 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm7 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
1583 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
1584 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm7
1585 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = ymm15[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
1586 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm0
1587 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
1588 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm15[4,5,6,7]
1589 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
1590 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm14 = ymm15[1],ymm14[1],ymm15[3],ymm14[3]
1591 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
1592 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm14[4,5,6,7]
1593 ; AVX2-FCP-NEXT: vpbroadcastq %xmm3, %ymm3
1594 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm3[1],ymm7[1],ymm3[3],ymm7[3]
1595 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
1596 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1597 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1598 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi)
1599 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1600 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi)
1601 ; AVX2-FCP-NEXT: vmovdqa %ymm8, 32(%rdx)
1602 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
1603 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx)
1604 ; AVX2-FCP-NEXT: vmovdqa %ymm10, 32(%rcx)
1605 ; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rcx)
1606 ; AVX2-FCP-NEXT: vmovdqa %ymm12, 32(%r8)
1607 ; AVX2-FCP-NEXT: vmovdqa %ymm11, (%r8)
1608 ; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r9)
1609 ; AVX2-FCP-NEXT: vmovdqa %ymm13, (%r9)
1610 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1611 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%rax)
1612 ; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rax)
1613 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1614 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax)
1615 ; AVX2-FCP-NEXT: vmovdqa %ymm2, (%rax)
1616 ; AVX2-FCP-NEXT: vzeroupper
1617 ; AVX2-FCP-NEXT: retq
1619 ; AVX512-LABEL: load_i64_stride7_vf8:
1620 ; AVX512: # %bb.0:
1621 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0
1622 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm1
1623 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm2
1624 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4
1625 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5
1626 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm9
1627 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm10
1628 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
1629 ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
1630 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
1631 ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1632 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm3
1633 ; AVX512-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
1634 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
1635 ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1636 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
1637 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm11
1638 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
1639 ; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
1640 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm11
1641 ; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
1642 ; AVX512-NEXT: vextracti128 $1, %ymm11, %xmm11
1643 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
1644 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm13
1645 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
1646 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11]
1647 ; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
1648 ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7]
1649 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12]
1650 ; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
1651 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
1652 ; AVX512-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
1653 ; AVX512-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
1654 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
1655 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
1656 ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
1657 ; AVX512-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
1658 ; AVX512-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8
1659 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm9
1660 ; AVX512-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1661 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1662 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
1663 ; AVX512-NEXT: movb $24, %r10b
1664 ; AVX512-NEXT: kmovw %r10d, %k2
1665 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
1666 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3]
1667 ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1668 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
1669 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9]
1670 ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
1671 ; AVX512-NEXT: vpermi2q %zmm0, %zmm13, %zmm15
1672 ; AVX512-NEXT: movb $-32, %r10b
1673 ; AVX512-NEXT: kmovw %r10d, %k1
1674 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1}
1675 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
1676 ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1677 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
1678 ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
1679 ; AVX512-NEXT: vpermi2q %zmm5, %zmm4, %zmm15
1680 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
1681 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11]
1682 ; AVX512-NEXT: vpermi2q %zmm4, %zmm5, %zmm14
1683 ; AVX512-NEXT: vpermt2q %zmm4, %zmm13, %zmm5
1684 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
1685 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
1686 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1687 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
1688 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
1689 ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1690 ; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
1691 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
1692 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
1693 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1694 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
1695 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
1696 ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1697 ; AVX512-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
1698 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
1699 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
1700 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
1701 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1702 ; AVX512-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
1703 ; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5
1704 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
1705 ; AVX512-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
1706 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
1707 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1708 ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
1709 ; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4
1710 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
1711 ; AVX512-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1712 ; AVX512-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
1713 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
1714 ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
1715 ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
1716 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7]
1717 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
1718 ; AVX512-NEXT: vmovdqa64 %zmm10, (%rsi)
1719 ; AVX512-NEXT: vmovdqa64 %zmm3, (%rdx)
1720 ; AVX512-NEXT: vmovdqa64 %zmm7, (%rcx)
1721 ; AVX512-NEXT: vmovdqa64 %zmm5, (%r8)
1722 ; AVX512-NEXT: vmovdqa64 %zmm8, (%r9)
1723 ; AVX512-NEXT: vmovdqa64 %zmm4, (%rdi)
1724 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
1725 ; AVX512-NEXT: vzeroupper
1726 ; AVX512-NEXT: retq
1728 ; AVX512-FCP-LABEL: load_i64_stride7_vf8:
1729 ; AVX512-FCP: # %bb.0:
1730 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
1731 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1
1732 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2
1733 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
1734 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
1735 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9
1736 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
1737 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
1738 ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
1739 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
1740 ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1741 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
1742 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
1743 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
1744 ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1745 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
1746 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm11
1747 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
1748 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
1749 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
1750 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
1751 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
1752 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
1753 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
1754 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
1755 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11]
1756 ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
1757 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7]
1758 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12]
1759 ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
1760 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
1761 ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
1762 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
1763 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
1764 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
1765 ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
1766 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
1767 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8
1768 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
1769 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1770 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1771 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi
1772 ; AVX512-FCP-NEXT: movb $24, %r10b
1773 ; AVX512-FCP-NEXT: kmovw %r10d, %k2
1774 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
1775 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3]
1776 ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1777 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
1778 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9]
1779 ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
1780 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm15
1781 ; AVX512-FCP-NEXT: movb $-32, %r10b
1782 ; AVX512-FCP-NEXT: kmovw %r10d, %k1
1783 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1}
1784 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
1785 ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1786 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
1787 ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
1788 ; AVX512-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm15
1789 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
1790 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11]
1791 ; AVX512-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14
1792 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5
1793 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
1794 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
1795 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1796 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
1797 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
1798 ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1799 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
1800 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
1801 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
1802 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1803 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
1804 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
1805 ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1806 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
1807 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
1808 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
1809 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
1810 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1811 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
1812 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5
1813 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
1814 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
1815 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
1816 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1817 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
1818 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4
1819 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
1820 ; AVX512-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1821 ; AVX512-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
1822 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
1823 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
1824 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
1825 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7]
1826 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
1827 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
1828 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
1829 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
1830 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
1831 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%r9)
1832 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rdi)
1833 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
1834 ; AVX512-FCP-NEXT: vzeroupper
1835 ; AVX512-FCP-NEXT: retq
1837 ; AVX512DQ-LABEL: load_i64_stride7_vf8:
1838 ; AVX512DQ: # %bb.0:
1839 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0
1840 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm1
1841 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm2
1842 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4
1843 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5
1844 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm9
1845 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm10
1846 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
1847 ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
1848 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
1849 ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1850 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm3
1851 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
1852 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
1853 ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1854 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
1855 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm11
1856 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
1857 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
1858 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm11
1859 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
1860 ; AVX512DQ-NEXT: vextracti128 $1, %ymm11, %xmm11
1861 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
1862 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm13
1863 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
1864 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11]
1865 ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
1866 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7]
1867 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12]
1868 ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
1869 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
1870 ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
1871 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
1872 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
1873 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
1874 ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
1875 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
1876 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8
1877 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm9
1878 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1879 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1880 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rdi
1881 ; AVX512DQ-NEXT: movb $24, %r10b
1882 ; AVX512DQ-NEXT: kmovw %r10d, %k2
1883 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
1884 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3]
1885 ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1886 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
1887 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9]
1888 ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
1889 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm13, %zmm15
1890 ; AVX512DQ-NEXT: movb $-32, %r10b
1891 ; AVX512DQ-NEXT: kmovw %r10d, %k1
1892 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1}
1893 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
1894 ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1895 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
1896 ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
1897 ; AVX512DQ-NEXT: vpermi2q %zmm5, %zmm4, %zmm15
1898 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
1899 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11]
1900 ; AVX512DQ-NEXT: vpermi2q %zmm4, %zmm5, %zmm14
1901 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm13, %zmm5
1902 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
1903 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
1904 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1905 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
1906 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
1907 ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1908 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
1909 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
1910 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
1911 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1912 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
1913 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
1914 ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
1915 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
1916 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
1917 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
1918 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
1919 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1920 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
1921 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5
1922 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
1923 ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
1924 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
1925 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
1926 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
1927 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4
1928 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
1929 ; AVX512DQ-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1930 ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
1931 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
1932 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
1933 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
1934 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7]
1935 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
1936 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rsi)
1937 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rdx)
1938 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rcx)
1939 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%r8)
1940 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%r9)
1941 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%rdi)
1942 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
1943 ; AVX512DQ-NEXT: vzeroupper
1944 ; AVX512DQ-NEXT: retq
1946 ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf8:
1947 ; AVX512DQ-FCP: # %bb.0:
1948 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
1949 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1
1950 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2
1951 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
1952 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
1953 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9
1954 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
1955 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
1956 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
1957 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
1958 ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
1959 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
1960 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
1961 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
1962 ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
1963 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
1964 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm11
1965 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
1966 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
1967 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm11
1968 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = mem[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
1969 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11
1970 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
1971 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm13
1972 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
1973 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [4,11]
1974 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
1975 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm14[0,1,2,3],ymm12[4,5,6,7]
1976 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm14 = [5,12]
1977 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm14
1978 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
1979 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
1980 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
1981 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
1982 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
1983 ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
1984 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
1985 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm15, %zmm9, %zmm8
1986 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
1987 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
1988 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1989 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi
1990 ; AVX512DQ-FCP-NEXT: movb $24, %r10b
1991 ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k2
1992 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
1993 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [10,3,10,3,10,3,10,3]
1994 ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
1995 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
1996 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [0,5,6,9,0,5,6,9]
1997 ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
1998 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm15
1999 ; AVX512DQ-FCP-NEXT: movb $-32, %r10b
2000 ; AVX512DQ-FCP-NEXT: kmovw %r10d, %k1
2001 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1}
2002 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [13,6,13,6,13,6,13,6]
2003 ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2004 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
2005 ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
2006 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm15
2007 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3],ymm9[4,5,6,7]
2008 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm14 = [0,0,4,11]
2009 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm14
2010 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm13, %zmm5
2011 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
2012 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
2013 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2014 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
2015 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
2016 ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
2017 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
2018 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
2019 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
2020 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2021 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
2022 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
2023 ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
2024 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
2025 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
2026 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm13
2027 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
2028 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
2029 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm13, %zmm4
2030 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm15, %zmm5
2031 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
2032 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
2033 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
2034 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
2035 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
2036 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm4, %zmm4
2037 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
2038 ; AVX512DQ-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2039 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
2040 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
2041 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
2042 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
2043 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm14[4,5,6,7]
2044 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
2045 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
2046 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
2047 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
2048 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
2049 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%r9)
2050 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rdi)
2051 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
2052 ; AVX512DQ-FCP-NEXT: vzeroupper
2053 ; AVX512DQ-FCP-NEXT: retq
2055 ; AVX512BW-LABEL: load_i64_stride7_vf8:
2056 ; AVX512BW: # %bb.0:
2057 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0
2058 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm1
2059 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm2
2060 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4
2061 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm5
2062 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm9
2063 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm10
2064 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
2065 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
2066 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
2067 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
2068 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3
2069 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
2070 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
2071 ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
2072 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
2073 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm11
2074 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
2075 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
2076 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
2077 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11
2078 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7]
2079 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11]
2080 ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
2081 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2082 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12]
2083 ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
2084 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
2085 ; AVX512BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
2086 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
2087 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
2088 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
2089 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
2090 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
2091 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8
2092 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2093 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
2094 ; AVX512BW-NEXT: movb $24, %r11b
2095 ; AVX512BW-NEXT: kmovd %r11d, %k2
2096 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
2097 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3]
2098 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2099 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
2100 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9]
2101 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
2102 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm14
2103 ; AVX512BW-NEXT: movb $-32, %r11b
2104 ; AVX512BW-NEXT: kmovd %r11d, %k1
2105 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1}
2106 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6]
2107 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2108 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0]
2109 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
2110 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14
2111 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm15
2112 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
2113 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
2114 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11]
2115 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13
2116 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5
2117 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
2118 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
2119 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2120 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
2121 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
2122 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
2123 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
2124 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
2125 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
2126 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2127 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
2128 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
2129 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
2130 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
2131 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
2132 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
2133 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
2134 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
2135 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm4
2136 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm5
2137 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
2138 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5
2139 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5
2140 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
2141 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
2142 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
2143 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
2144 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
2145 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4
2146 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
2147 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2148 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
2149 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
2150 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
2151 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
2152 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7]
2153 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
2154 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rsi)
2155 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rdx)
2156 ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rcx)
2157 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r8)
2158 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%r9)
2159 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10)
2160 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
2161 ; AVX512BW-NEXT: vzeroupper
2162 ; AVX512BW-NEXT: retq
2164 ; AVX512BW-FCP-LABEL: load_i64_stride7_vf8:
2165 ; AVX512BW-FCP: # %bb.0:
2166 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
2167 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1
2168 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2
2169 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
2170 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
2171 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9
2172 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
2173 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
2174 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
2175 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
2176 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
2177 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
2178 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
2179 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
2180 ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
2181 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
2182 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm11
2183 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
2184 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
2185 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
2186 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
2187 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7]
2188 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11]
2189 ; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
2190 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2191 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12]
2192 ; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
2193 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
2194 ; AVX512BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
2195 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
2196 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
2197 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
2198 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
2199 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
2200 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8
2201 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2202 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2203 ; AVX512BW-FCP-NEXT: movb $24, %r11b
2204 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k2
2205 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
2206 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3]
2207 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2208 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
2209 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9]
2210 ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
2211 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm14
2212 ; AVX512BW-FCP-NEXT: movb $-32, %r11b
2213 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
2214 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1}
2215 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6]
2216 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2217 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0]
2218 ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
2219 ; AVX512BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm14
2220 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15
2221 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
2222 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
2223 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11]
2224 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13
2225 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5
2226 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
2227 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
2228 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2229 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
2230 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
2231 ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
2232 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
2233 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
2234 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
2235 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2236 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
2237 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
2238 ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
2239 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
2240 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
2241 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
2242 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
2243 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
2244 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm4
2245 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
2246 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
2247 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
2248 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5
2249 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
2250 ; AVX512BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
2251 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
2252 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
2253 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
2254 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4
2255 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
2256 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2257 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
2258 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
2259 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
2260 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
2261 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7]
2262 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
2263 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
2264 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
2265 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
2266 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
2267 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9)
2268 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
2269 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
2270 ; AVX512BW-FCP-NEXT: vzeroupper
2271 ; AVX512BW-FCP-NEXT: retq
2273 ; AVX512DQ-BW-LABEL: load_i64_stride7_vf8:
2274 ; AVX512DQ-BW: # %bb.0:
2275 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm0
2276 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm1
2277 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm2
2278 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4
2279 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm5
2280 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm9
2281 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm10
2282 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
2283 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
2284 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
2285 ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
2286 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3
2287 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
2288 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
2289 ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
2290 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
2291 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm11
2292 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
2293 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
2294 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
2295 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11
2296 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7]
2297 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11]
2298 ; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
2299 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2300 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12]
2301 ; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
2302 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
2303 ; AVX512DQ-BW-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
2304 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
2305 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
2306 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
2307 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
2308 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
2309 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8
2310 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
2311 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
2312 ; AVX512DQ-BW-NEXT: movb $24, %r11b
2313 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k2
2314 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
2315 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3]
2316 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2317 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
2318 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9]
2319 ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
2320 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm14
2321 ; AVX512DQ-BW-NEXT: movb $-32, %r11b
2322 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1
2323 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1}
2324 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6]
2325 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2326 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0]
2327 ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
2328 ; AVX512DQ-BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm14
2329 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm15
2330 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
2331 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
2332 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11]
2333 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm13
2334 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm5
2335 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
2336 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
2337 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2338 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
2339 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
2340 ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
2341 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
2342 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
2343 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
2344 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2345 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
2346 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
2347 ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
2348 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
2349 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
2350 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
2351 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
2352 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
2353 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm9, %zmm4
2354 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm5
2355 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
2356 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5
2357 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5
2358 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
2359 ; AVX512DQ-BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
2360 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
2361 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
2362 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
2363 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4
2364 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
2365 ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2366 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
2367 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
2368 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
2369 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
2370 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7]
2371 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
2372 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rsi)
2373 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rdx)
2374 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rcx)
2375 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r8)
2376 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%r9)
2377 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r10)
2378 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
2379 ; AVX512DQ-BW-NEXT: vzeroupper
2380 ; AVX512DQ-BW-NEXT: retq
2382 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf8:
2383 ; AVX512DQ-BW-FCP: # %bb.0:
2384 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
2385 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm1
2386 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm2
2387 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
2388 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
2389 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm9
2390 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm10
2391 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,7,14,0,0,7,14,0]
2392 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
2393 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,0,7,0,9,0,7,0]
2394 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
2395 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
2396 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm6, %zmm3
2397 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [14,0,0,7,14,0,0,7]
2398 ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
2399 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
2400 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm11
2401 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
2402 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm7, %zmm7
2403 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm12
2404 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
2405 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm11[0,1,2,3,4,5],ymm12[6,7]
2406 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [4,11]
2407 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
2408 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2409 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm13 = [5,12]
2410 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm13
2411 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
2412 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm9, %zmm10, %zmm12
2413 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm8, %zmm10
2414 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm8
2415 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [4,5,6,13,4,5,6,13]
2416 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
2417 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm8, %zmm9
2418 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm9, %zmm8
2419 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2420 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2421 ; AVX512DQ-BW-FCP-NEXT: movb $24, %r11b
2422 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2
2423 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm5[4,5,4,5],zmm4[4,5,4,5]
2424 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [10,3,10,3,10,3,10,3]
2425 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2426 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
2427 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [0,5,6,9,0,5,6,9]
2428 ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
2429 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm14
2430 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %r11b
2431 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
2432 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm10 {%k1}
2433 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [13,6,13,6,13,6,13,6]
2434 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2435 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [7,0,9,0,7,0,9,0]
2436 ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
2437 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm5, %zmm4, %zmm14
2438 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15
2439 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
2440 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3],ymm11[4,5,6,7]
2441 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm13 = [0,0,4,11]
2442 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm5, %zmm13
2443 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm5
2444 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k2}
2445 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [11,4,11,4,11,4,11,4]
2446 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2447 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
2448 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,10,0,5,6,10]
2449 ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
2450 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
2451 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3 {%k1}
2452 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [12,5,12,5,12,5,12,5]
2453 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2454 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
2455 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,5,6,11,0,5,6,11]
2456 ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
2457 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
2458 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1}
2459 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm9
2460 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,5,6,12,0,5,6,12]
2461 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
2462 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm9, %zmm4
2463 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
2464 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
2465 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
2466 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm14, %zmm5
2467 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
2468 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
2469 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
2470 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
2471 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
2472 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm4, %zmm4
2473 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [2,9,2,9,2,9,2,9]
2474 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
2475 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
2476 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
2477 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
2478 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm1
2479 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm13[4,5,6,7]
2480 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
2481 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rsi)
2482 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rdx)
2483 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rcx)
2484 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r8)
2485 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%r9)
2486 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
2487 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
2488 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
2489 ; AVX512DQ-BW-FCP-NEXT: retq
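; The seven shufflevectors below each extract one stride-7 element sequence
; (k, k+7, k+14, ...) from the single <56 x i64> wide load, producing one
; <8 x i64> result per output pointer.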
2490 %wide.vec = load <56 x i64>, ptr %in.vec, align 64
2491 %strided.vec0 = shufflevector <56 x i64> %wide.vec, <56 x i64> poison, <8 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49>
2492 %strided.vec1 = shufflevector <56 x i64> %wide.vec, <56 x i64> poison, <8 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50>
2493 %strided.vec2 = shufflevector <56 x i64> %wide.vec, <56 x i64> poison, <8 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51>
2494 %strided.vec3 = shufflevector <56 x i64> %wide.vec, <56 x i64> poison, <8 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52>
2495 %strided.vec4 = shufflevector <56 x i64> %wide.vec, <56 x i64> poison, <8 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53>
2496 %strided.vec5 = shufflevector <56 x i64> %wide.vec, <56 x i64> poison, <8 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54>
2497 %strided.vec6 = shufflevector <56 x i64> %wide.vec, <56 x i64> poison, <8 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55>
2498 store <8 x i64> %strided.vec0, ptr %out.vec0, align 64
2499 store <8 x i64> %strided.vec1, ptr %out.vec1, align 64
2500 store <8 x i64> %strided.vec2, ptr %out.vec2, align 64
2501 store <8 x i64> %strided.vec3, ptr %out.vec3, align 64
2502 store <8 x i64> %strided.vec4, ptr %out.vec4, align 64
2503 store <8 x i64> %strided.vec5, ptr %out.vec5, align 64
2504 store <8 x i64> %strided.vec6, ptr %out.vec6, align 64
2505 ret void
2506 }
2508 define void @load_i64_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
2509 ; SSE-LABEL: load_i64_stride7_vf16:
2510 ; SSE: # %bb.0:
2511 ; SSE-NEXT: subq $536, %rsp # imm = 0x218
2512 ; SSE-NEXT: movapd 208(%rdi), %xmm3
2513 ; SSE-NEXT: movapd 96(%rdi), %xmm2
2514 ; SSE-NEXT: movapd 144(%rdi), %xmm4
2515 ; SSE-NEXT: movapd 192(%rdi), %xmm6
2516 ; SSE-NEXT: movapd 80(%rdi), %xmm5
2517 ; SSE-NEXT: movapd 128(%rdi), %xmm8
2518 ; SSE-NEXT: movapd 64(%rdi), %xmm10
2519 ; SSE-NEXT: movapd 176(%rdi), %xmm11
2520 ; SSE-NEXT: movapd (%rdi), %xmm12
2521 ; SSE-NEXT: movapd 16(%rdi), %xmm9
2522 ; SSE-NEXT: movapd 32(%rdi), %xmm7
2523 ; SSE-NEXT: movapd 48(%rdi), %xmm0
2524 ; SSE-NEXT: movapd 224(%rdi), %xmm13
2525 ; SSE-NEXT: movapd 112(%rdi), %xmm14
2526 ; SSE-NEXT: movapd 160(%rdi), %xmm1
2527 ; SSE-NEXT: movapd %xmm0, %xmm15
2528 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1]
2529 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2530 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0]
2531 ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2532 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1]
2533 ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2534 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0]
2535 ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2536 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1]
2537 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2538 ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0]
2539 ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2540 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2541 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2542 ; SSE-NEXT: movapd %xmm1, %xmm0
2543 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1]
2544 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2545 ; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0]
2546 ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2547 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1]
2548 ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2549 ; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0]
2550 ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2551 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
2552 ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2553 ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0]
2554 ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2555 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
2556 ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2557 ; SSE-NEXT: movapd 272(%rdi), %xmm0
2558 ; SSE-NEXT: movapd %xmm0, %xmm1
2559 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1]
2560 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2561 ; SSE-NEXT: movapd 288(%rdi), %xmm1
2562 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0]
2563 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2564 ; SSE-NEXT: movapd 240(%rdi), %xmm2
2565 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2566 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2567 ; SSE-NEXT: movapd 304(%rdi), %xmm1
2568 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
2569 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2570 ; SSE-NEXT: movapd 256(%rdi), %xmm2
2571 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2572 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2573 ; SSE-NEXT: movapd 320(%rdi), %xmm1
2574 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
2575 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2576 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2577 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2578 ; SSE-NEXT: movapd 336(%rdi), %xmm2
2579 ; SSE-NEXT: movapd 384(%rdi), %xmm0
2580 ; SSE-NEXT: movapd %xmm0, %xmm1
2581 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2582 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2583 ; SSE-NEXT: movapd 400(%rdi), %xmm1
2584 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
2585 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2586 ; SSE-NEXT: movapd 352(%rdi), %xmm2
2587 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2588 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2589 ; SSE-NEXT: movapd 416(%rdi), %xmm1
2590 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
2591 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2592 ; SSE-NEXT: movapd 368(%rdi), %xmm2
2593 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2594 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2595 ; SSE-NEXT: movapd 432(%rdi), %xmm1
2596 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
2597 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2598 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2599 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2600 ; SSE-NEXT: movapd 448(%rdi), %xmm2
2601 ; SSE-NEXT: movapd 496(%rdi), %xmm0
2602 ; SSE-NEXT: movapd %xmm0, %xmm1
2603 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2604 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2605 ; SSE-NEXT: movapd 512(%rdi), %xmm1
2606 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
2607 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2608 ; SSE-NEXT: movapd 464(%rdi), %xmm2
2609 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2610 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2611 ; SSE-NEXT: movapd 528(%rdi), %xmm1
2612 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
2613 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2614 ; SSE-NEXT: movapd 480(%rdi), %xmm2
2615 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2616 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2617 ; SSE-NEXT: movapd 544(%rdi), %xmm1
2618 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
2619 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2620 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2621 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2622 ; SSE-NEXT: movapd 560(%rdi), %xmm13
2623 ; SSE-NEXT: movapd 608(%rdi), %xmm0
2624 ; SSE-NEXT: movapd %xmm0, %xmm1
2625 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1]
2626 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2627 ; SSE-NEXT: movapd 624(%rdi), %xmm14
2628 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm14[0]
2629 ; SSE-NEXT: movapd 576(%rdi), %xmm2
2630 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1]
2631 ; SSE-NEXT: movapd 640(%rdi), %xmm1
2632 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
2633 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2634 ; SSE-NEXT: movapd 592(%rdi), %xmm2
2635 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
2636 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2637 ; SSE-NEXT: movapd 656(%rdi), %xmm1
2638 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
2639 ; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill
2640 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2641 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2642 ; SSE-NEXT: movapd 672(%rdi), %xmm6
2643 ; SSE-NEXT: movapd 720(%rdi), %xmm0
2644 ; SSE-NEXT: movapd %xmm0, %xmm7
2645 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1]
2646 ; SSE-NEXT: movapd 736(%rdi), %xmm8
2647 ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm8[0]
2648 ; SSE-NEXT: movapd 688(%rdi), %xmm10
2649 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm10[0],xmm8[1]
2650 ; SSE-NEXT: movapd 752(%rdi), %xmm12
2651 ; SSE-NEXT: shufpd {{.*#+}} xmm10 = xmm10[1],xmm12[0]
2652 ; SSE-NEXT: movapd 704(%rdi), %xmm15
2653 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm15[0],xmm12[1]
2654 ; SSE-NEXT: movapd 768(%rdi), %xmm1
2655 ; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm1[0]
2656 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
2657 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2658 ; SSE-NEXT: movapd 784(%rdi), %xmm0
2659 ; SSE-NEXT: movapd 832(%rdi), %xmm4
2660 ; SSE-NEXT: movapd %xmm4, %xmm2
2661 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
2662 ; SSE-NEXT: movapd 848(%rdi), %xmm1
2663 ; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
2664 ; SSE-NEXT: movapd 800(%rdi), %xmm3
2665 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
2666 ; SSE-NEXT: movapd 864(%rdi), %xmm5
2667 ; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm5[0]
2668 ; SSE-NEXT: movapd 816(%rdi), %xmm9
2669 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm9[0],xmm5[1]
2670 ; SSE-NEXT: movapd 880(%rdi), %xmm11
2671 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm11[0]
2672 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm4[0],xmm11[1]
2673 ; SSE-NEXT: movapd %xmm7, 96(%rsi)
2674 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2675 ; SSE-NEXT: movaps %xmm4, 32(%rsi)
2676 ; SSE-NEXT: movapd %xmm2, 112(%rsi)
2677 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2678 ; SSE-NEXT: movaps %xmm2, 48(%rsi)
2679 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2680 ; SSE-NEXT: movaps %xmm2, 64(%rsi)
2681 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2682 ; SSE-NEXT: movaps %xmm2, (%rsi)
2683 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2684 ; SSE-NEXT: movaps %xmm2, 80(%rsi)
2685 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2686 ; SSE-NEXT: movaps %xmm2, 16(%rsi)
2687 ; SSE-NEXT: movapd %xmm6, 96(%rdx)
2688 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2689 ; SSE-NEXT: movaps %xmm2, 32(%rdx)
2690 ; SSE-NEXT: movapd %xmm0, 112(%rdx)
2691 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2692 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
2693 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2694 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
2695 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2696 ; SSE-NEXT: movaps %xmm0, (%rdx)
2697 ; SSE-NEXT: movapd %xmm13, 80(%rdx)
2698 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2699 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
2700 ; SSE-NEXT: movapd %xmm8, 96(%rcx)
2701 ; SSE-NEXT: movapd %xmm1, 112(%rcx)
2702 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2703 ; SSE-NEXT: movaps %xmm0, 64(%rcx)
2704 ; SSE-NEXT: movapd %xmm14, 80(%rcx)
2705 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2706 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
2707 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2708 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
2709 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2710 ; SSE-NEXT: movaps %xmm0, (%rcx)
2711 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2712 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
2713 ; SSE-NEXT: movapd %xmm3, 112(%r8)
2714 ; SSE-NEXT: movapd %xmm10, 96(%r8)
2715 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2716 ; SSE-NEXT: movaps %xmm0, 80(%r8)
2717 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2718 ; SSE-NEXT: movaps %xmm0, 64(%r8)
2719 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2720 ; SSE-NEXT: movaps %xmm0, 48(%r8)
2721 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2722 ; SSE-NEXT: movaps %xmm0, 32(%r8)
2723 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2724 ; SSE-NEXT: movaps %xmm0, 16(%r8)
2725 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2726 ; SSE-NEXT: movaps %xmm0, (%r8)
2727 ; SSE-NEXT: movapd %xmm5, 112(%r9)
2728 ; SSE-NEXT: movapd %xmm12, 96(%r9)
2729 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2730 ; SSE-NEXT: movaps %xmm0, 80(%r9)
2731 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2732 ; SSE-NEXT: movaps %xmm0, 64(%r9)
2733 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2734 ; SSE-NEXT: movaps %xmm0, 48(%r9)
2735 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2736 ; SSE-NEXT: movaps %xmm0, 32(%r9)
2737 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2738 ; SSE-NEXT: movaps %xmm0, 16(%r9)
2739 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2740 ; SSE-NEXT: movaps %xmm0, (%r9)
2741 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2742 ; SSE-NEXT: movapd %xmm9, 112(%rax)
2743 ; SSE-NEXT: movapd %xmm15, 96(%rax)
2744 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
2745 ; SSE-NEXT: movaps %xmm0, 80(%rax)
2746 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2747 ; SSE-NEXT: movaps %xmm0, 64(%rax)
2748 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2749 ; SSE-NEXT: movaps %xmm0, 48(%rax)
2750 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2751 ; SSE-NEXT: movaps %xmm0, 32(%rax)
2752 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2753 ; SSE-NEXT: movaps %xmm0, 16(%rax)
2754 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2755 ; SSE-NEXT: movaps %xmm0, (%rax)
2756 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2757 ; SSE-NEXT: movapd %xmm11, 112(%rax)
2758 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2759 ; SSE-NEXT: movaps %xmm0, 96(%rax)
2760 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2761 ; SSE-NEXT: movaps %xmm0, 80(%rax)
2762 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2763 ; SSE-NEXT: movaps %xmm0, 64(%rax)
2764 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2765 ; SSE-NEXT: movaps %xmm0, 48(%rax)
2766 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2767 ; SSE-NEXT: movaps %xmm0, 32(%rax)
2768 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2769 ; SSE-NEXT: movaps %xmm0, 16(%rax)
2770 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2771 ; SSE-NEXT: movaps %xmm0, (%rax)
2772 ; SSE-NEXT: addq $536, %rsp # imm = 0x218
2773 ; SSE-NEXT: retq
2775 ; AVX-LABEL: load_i64_stride7_vf16:
2776 ; AVX: # %bb.0:
2777 ; AVX-NEXT: subq $552, %rsp # imm = 0x228
2778 ; AVX-NEXT: vmovapd 544(%rdi), %ymm0
2779 ; AVX-NEXT: vmovapd 96(%rdi), %ymm1
2780 ; AVX-NEXT: vmovaps 768(%rdi), %ymm2
2781 ; AVX-NEXT: vmovaps 320(%rdi), %ymm3
2782 ; AVX-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm4
2783 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2784 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7]
2785 ; AVX-NEXT: vmovaps 224(%rdi), %xmm5
2786 ; AVX-NEXT: vmovaps 272(%rdi), %xmm6
2787 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2788 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3]
2789 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
2790 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2791 ; AVX-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm4
2792 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2793 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7]
2794 ; AVX-NEXT: vmovaps 672(%rdi), %xmm6
2795 ; AVX-NEXT: vmovaps 720(%rdi), %xmm7
2796 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2797 ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3]
2798 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
2799 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2800 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm4
2801 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2802 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3]
2803 ; AVX-NEXT: vmovapd 48(%rdi), %xmm7
2804 ; AVX-NEXT: vmovapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2805 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = mem[0],xmm7[1]
2806 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3]
2807 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2808 ; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm4
2809 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2810 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3]
2811 ; AVX-NEXT: vmovapd 448(%rdi), %xmm7
2812 ; AVX-NEXT: vmovapd 496(%rdi), %xmm8
2813 ; AVX-NEXT: vmovapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2814 ; AVX-NEXT: vblendpd {{.*#+}} xmm8 = xmm7[0],xmm8[1]
2815 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm8[0,1],ymm4[2,3]
2816 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2817 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm4
2818 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
2819 ; AVX-NEXT: vmovapd 384(%rdi), %ymm5
2820 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2]
2821 ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
2822 ; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2823 ; AVX-NEXT: vmovdqa 736(%rdi), %xmm3
2824 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
2825 ; AVX-NEXT: vmovapd 832(%rdi), %ymm4
2826 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm4[1],ymm2[3],ymm4[2]
2827 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
2828 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2829 ; AVX-NEXT: vmovapd 160(%rdi), %ymm2
2830 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
2831 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm6
2832 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
2833 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1],ymm1[2,3]
2834 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2835 ; AVX-NEXT: vmovapd 608(%rdi), %ymm8
2836 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[3],ymm8[2]
2837 ; AVX-NEXT: vmovdqa 512(%rdi), %xmm9
2838 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm7[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
2839 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
2840 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2841 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm1
2842 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
2843 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3]
2844 ; AVX-NEXT: vmovapd 240(%rdi), %xmm3
2845 ; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm3[0],mem[1]
2846 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm5[0,1],ymm0[2,3]
2847 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2848 ; AVX-NEXT: vmovdqa 800(%rdi), %xmm7
2849 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
2850 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3]
2851 ; AVX-NEXT: vmovapd 688(%rdi), %xmm5
2852 ; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm5[0],mem[1]
2853 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3]
2854 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2855 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm4
2856 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
2857 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3]
2858 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm10
2859 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm6[4,5,6,7]
2860 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
2861 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2862 ; AVX-NEXT: vmovdqa 576(%rdi), %xmm2
2863 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
2864 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3]
2865 ; AVX-NEXT: vmovdqa 464(%rdi), %xmm6
2866 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm9[4,5,6,7]
2867 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm0[2,3]
2868 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2869 ; AVX-NEXT: vmovapd 752(%rdi), %xmm12
2870 ; AVX-NEXT: vshufpd {{.*#+}} ymm8 = ymm5[1],ymm12[0],ymm5[2],ymm12[3]
2871 ; AVX-NEXT: vmovdqa 864(%rdi), %xmm5
2872 ; AVX-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
2873 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
2874 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1],ymm7[2,3]
2875 ; AVX-NEXT: vmovupd %ymm0, (%rsp) # 32-byte Spill
2876 ; AVX-NEXT: vmovapd 304(%rdi), %xmm7
2877 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3]
2878 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm8
2879 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
2880 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2881 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm1[2,3]
2882 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2883 ; AVX-NEXT: vmovapd 80(%rdi), %xmm11
2884 ; AVX-NEXT: vshufpd {{.*#+}} ymm9 = ymm10[1],ymm11[0],ymm10[2],ymm11[3]
2885 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm10
2886 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
2887 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
2888 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1],ymm4[2,3]
2889 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2890 ; AVX-NEXT: vmovapd 528(%rdi), %xmm4
2891 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm4[0],ymm6[2],ymm4[3]
2892 ; AVX-NEXT: vmovdqa 640(%rdi), %xmm0
2893 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
2894 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
2895 ; AVX-NEXT: vblendpd {{.*#+}} ymm13 = ymm6[0,1],ymm2[2,3]
2896 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm2
2897 ; AVX-NEXT: vmovapd 352(%rdi), %ymm6
2898 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3]
2899 ; AVX-NEXT: vmovapd 256(%rdi), %xmm8
2900 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm8[0],xmm7[1]
2901 ; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm7[0,1],ymm2[2,3]
2902 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm2
2903 ; AVX-NEXT: vmovapd 800(%rdi), %ymm3
2904 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3]
2905 ; AVX-NEXT: vmovapd 704(%rdi), %xmm5
2906 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm5[0],xmm12[1]
2907 ; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm7[0,1],ymm2[2,3]
2908 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2909 ; AVX-NEXT: vmovapd 576(%rdi), %ymm7
2910 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3]
2911 ; AVX-NEXT: vmovapd 480(%rdi), %xmm1
2912 ; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm1[0],xmm4[1]
2913 ; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm4[0,1],ymm0[2,3]
2914 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm0
2915 ; AVX-NEXT: vmovapd 128(%rdi), %ymm10
2916 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1,2],ymm0[3]
2917 ; AVX-NEXT: vblendpd {{.*#+}} xmm4 = mem[0],xmm11[1]
2918 ; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm4[0,1],ymm0[2,3]
2919 ; AVX-NEXT: vmovapd 416(%rdi), %ymm2
2920 ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm6[0],ymm2[1],ymm6[3],ymm2[2]
2921 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm6
2922 ; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
2923 ; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm8[0,1],ymm4[2,3]
2924 ; AVX-NEXT: vmovapd 864(%rdi), %ymm4
2925 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[3],ymm4[2]
2926 ; AVX-NEXT: vmovdqa 768(%rdi), %xmm0
2927 ; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
2928 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm3[2,3]
2929 ; AVX-NEXT: vmovdqa 544(%rdi), %xmm3
2930 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
2931 ; AVX-NEXT: vmovapd 640(%rdi), %ymm3
2932 ; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm3[1],ymm7[3],ymm3[2]
2933 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm1[0,1],ymm7[2,3]
2934 ; AVX-NEXT: vmovapd 192(%rdi), %ymm1
2935 ; AVX-NEXT: vshufpd {{.*#+}} ymm10 = ymm10[0],ymm1[1],ymm10[3],ymm1[2]
2936 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm15
2937 ; AVX-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
2938 ; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm15[0,1],ymm10[2,3]
2939 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
2940 ; AVX-NEXT: # ymm2 = mem[0,1,2],ymm2[3]
2941 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
2942 ; AVX-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7]
2943 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3]
2944 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
2945 ; AVX-NEXT: # ymm4 = mem[0,1,2],ymm4[3]
2946 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2947 ; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
2948 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
2949 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
2950 ; AVX-NEXT: # ymm1 = mem[0,1,2],ymm1[3]
2951 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2952 ; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
2953 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3]
2954 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
2955 ; AVX-NEXT: # ymm3 = mem[0,1,2],ymm3[3]
2956 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2957 ; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
2958 ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
2959 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2960 ; AVX-NEXT: vmovaps %ymm4, 64(%rsi)
2961 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2962 ; AVX-NEXT: vmovaps %ymm4, (%rsi)
2963 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2964 ; AVX-NEXT: vmovaps %ymm4, 96(%rsi)
2965 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2966 ; AVX-NEXT: vmovaps %ymm4, 32(%rsi)
2967 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2968 ; AVX-NEXT: vmovaps %ymm4, 64(%rdx)
2969 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2970 ; AVX-NEXT: vmovaps %ymm4, (%rdx)
2971 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2972 ; AVX-NEXT: vmovaps %ymm4, 96(%rdx)
2973 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2974 ; AVX-NEXT: vmovaps %ymm4, 32(%rdx)
2975 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2976 ; AVX-NEXT: vmovaps %ymm4, 64(%rcx)
2977 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2978 ; AVX-NEXT: vmovaps %ymm4, (%rcx)
2979 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2980 ; AVX-NEXT: vmovaps %ymm4, 96(%rcx)
2981 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2982 ; AVX-NEXT: vmovaps %ymm4, 32(%rcx)
2983 ; AVX-NEXT: vmovapd %ymm13, 64(%r8)
2984 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2985 ; AVX-NEXT: vmovaps %ymm4, (%r8)
2986 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2987 ; AVX-NEXT: vmovaps %ymm4, 32(%r8)
2988 ; AVX-NEXT: vmovups (%rsp), %ymm4 # 32-byte Reload
2989 ; AVX-NEXT: vmovaps %ymm4, 96(%r8)
2990 ; AVX-NEXT: vmovapd %ymm11, (%r9)
2991 ; AVX-NEXT: vmovapd %ymm9, 64(%r9)
2992 ; AVX-NEXT: vmovapd %ymm12, 96(%r9)
2993 ; AVX-NEXT: vmovapd %ymm14, 32(%r9)
2994 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2995 ; AVX-NEXT: vmovapd %ymm10, (%rax)
2996 ; AVX-NEXT: vmovapd %ymm7, 64(%rax)
2997 ; AVX-NEXT: vmovapd %ymm5, 96(%rax)
2998 ; AVX-NEXT: vmovapd %ymm8, 32(%rax)
2999 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
3000 ; AVX-NEXT: vmovapd %ymm3, 64(%rax)
3001 ; AVX-NEXT: vmovapd %ymm1, (%rax)
3002 ; AVX-NEXT: vmovapd %ymm0, 96(%rax)
3003 ; AVX-NEXT: vmovapd %ymm2, 32(%rax)
3004 ; AVX-NEXT: addq $552, %rsp # imm = 0x228
3005 ; AVX-NEXT: vzeroupper
3006 ; AVX-NEXT: retq
3007 ;
3008 ; AVX2-LABEL: load_i64_stride7_vf16:
3009 ; AVX2: # %bb.0:
3010 ; AVX2-NEXT: subq $520, %rsp # imm = 0x208
3011 ; AVX2-NEXT: vmovdqa 544(%rdi), %ymm0
3012 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm1
3013 ; AVX2-NEXT: vmovdqa 768(%rdi), %ymm2
3014 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm3
3015 ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm4
3016 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3017 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3018 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7]
3019 ; AVX2-NEXT: vmovdqa 224(%rdi), %xmm5
3020 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm6
3021 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3022 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3]
3023 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
3024 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3025 ; AVX2-NEXT: vmovdqa 832(%rdi), %xmm4
3026 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3027 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3028 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7]
3029 ; AVX2-NEXT: vmovdqa 672(%rdi), %xmm6
3030 ; AVX2-NEXT: vmovdqa 720(%rdi), %xmm13
3031 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm13[2,3]
3032 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
3033 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3034 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm4
3035 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3036 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3037 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7]
3038 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm11
3039 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm11[2,3]
3040 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
3041 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3042 ; AVX2-NEXT: vmovdqa 608(%rdi), %xmm4
3043 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3044 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3045 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7]
3046 ; AVX2-NEXT: vmovdqa 448(%rdi), %xmm7
3047 ; AVX2-NEXT: vmovdqa 496(%rdi), %xmm8
3048 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3049 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm8[2,3]
3050 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
3051 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3052 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm4
3053 ; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
3054 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm5
3055 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
3056 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3057 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3058 ; AVX2-NEXT: vmovdqa 736(%rdi), %xmm3
3059 ; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
3060 ; AVX2-NEXT: vmovdqa 832(%rdi), %ymm4
3061 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
3062 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
3063 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3064 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm2
3065 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
3066 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3
3067 ; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
3068 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
3069 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3070 ; AVX2-NEXT: vmovdqa 608(%rdi), %ymm1
3071 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
3072 ; AVX2-NEXT: vmovdqa 512(%rdi), %xmm6
3073 ; AVX2-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
3074 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
3075 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3076 ; AVX2-NEXT: vpbroadcastq 352(%rdi), %ymm0
3077 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3]
3078 ; AVX2-NEXT: vmovdqa 240(%rdi), %xmm5
3079 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
3080 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3081 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3082 ; AVX2-NEXT: vpbroadcastq 800(%rdi), %ymm0
3083 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3]
3084 ; AVX2-NEXT: vmovdqa 688(%rdi), %xmm4
3085 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
3086 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
3087 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3088 ; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm0
3089 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
3090 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3]
3091 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3092 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3093 ; AVX2-NEXT: vpbroadcastq 576(%rdi), %ymm0
3094 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3095 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3]
3096 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3097 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3098 ; AVX2-NEXT: vmovdqa 736(%rdi), %ymm0
3099 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3100 ; AVX2-NEXT: vmovdqa 864(%rdi), %xmm1
3101 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
3102 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3103 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
3104 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3105 ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm0
3106 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3107 ; AVX2-NEXT: vmovdqa 416(%rdi), %xmm2
3108 ; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
3109 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3110 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
3111 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3112 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
3113 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3114 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm10
3115 ; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
3116 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3117 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
3118 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3119 ; AVX2-NEXT: vmovdqa 512(%rdi), %ymm0
3120 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3121 ; AVX2-NEXT: vmovdqa 640(%rdi), %xmm4
3122 ; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
3123 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3124 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]
3125 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
3126 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
3127 ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm6
3128 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
3129 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm8
3130 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3]
3131 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3132 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3133 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3134 ; AVX2-NEXT: vmovdqa 800(%rdi), %ymm3
3135 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
3136 ; AVX2-NEXT: vmovdqa 704(%rdi), %xmm7
3137 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3]
3138 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3139 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
3140 ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm9
3141 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
3142 ; AVX2-NEXT: vmovdqa 480(%rdi), %xmm1
3143 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
3144 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3145 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0
3146 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm10
3147 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
3148 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5
3149 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
3150 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3151 ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm4
3152 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
3153 ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm2
3154 ; AVX2-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
3155 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
3156 ; AVX2-NEXT: vmovdqa 864(%rdi), %ymm8
3157 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
3158 ; AVX2-NEXT: vmovdqa 768(%rdi), %xmm0
3159 ; AVX2-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
3160 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7]
3161 ; AVX2-NEXT: vmovdqa 544(%rdi), %xmm3
3162 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
3163 ; AVX2-NEXT: vmovdqa 640(%rdi), %ymm3
3164 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
3165 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
3166 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm9
3167 ; AVX2-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
3168 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm15
3169 ; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
3170 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
3171 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
3172 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm4[1],ymm15[3],ymm4[3]
3173 ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
3174 ; AVX2-NEXT: # xmm2 = mem[0,1],xmm2[2,3]
3175 ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm4[4,5,6,7]
3176 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
3177 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
3178 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
3179 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3180 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
3181 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
3182 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3]
3183 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3184 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
3185 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
3186 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3187 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
3188 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3189 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3190 ; AVX2-NEXT: vmovaps %ymm2, 64(%rsi)
3191 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3192 ; AVX2-NEXT: vmovaps %ymm2, (%rsi)
3193 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3194 ; AVX2-NEXT: vmovaps %ymm2, 96(%rsi)
3195 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3196 ; AVX2-NEXT: vmovaps %ymm2, 32(%rsi)
3197 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3198 ; AVX2-NEXT: vmovaps %ymm2, 64(%rdx)
3199 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3200 ; AVX2-NEXT: vmovaps %ymm2, (%rdx)
3201 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3202 ; AVX2-NEXT: vmovaps %ymm2, 96(%rdx)
3203 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3204 ; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
3205 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3206 ; AVX2-NEXT: vmovaps %ymm2, 64(%rcx)
3207 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3208 ; AVX2-NEXT: vmovaps %ymm2, (%rcx)
3209 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3210 ; AVX2-NEXT: vmovaps %ymm2, 96(%rcx)
3211 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3212 ; AVX2-NEXT: vmovaps %ymm2, 32(%rcx)
3213 ; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
3214 ; AVX2-NEXT: vmovaps %ymm2, 64(%r8)
3215 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3216 ; AVX2-NEXT: vmovaps %ymm2, (%r8)
3217 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3218 ; AVX2-NEXT: vmovaps %ymm2, 32(%r8)
3219 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3220 ; AVX2-NEXT: vmovaps %ymm2, 96(%r8)
3221 ; AVX2-NEXT: vmovdqa %ymm5, (%r9)
3222 ; AVX2-NEXT: vmovdqa %ymm12, 64(%r9)
3223 ; AVX2-NEXT: vmovdqa %ymm14, 96(%r9)
3224 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3225 ; AVX2-NEXT: vmovaps %ymm2, 32(%r9)
3226 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
3227 ; AVX2-NEXT: vmovdqa %ymm10, (%rax)
3228 ; AVX2-NEXT: vmovdqa %ymm1, 64(%rax)
3229 ; AVX2-NEXT: vmovdqa %ymm7, 96(%rax)
3230 ; AVX2-NEXT: vmovdqa %ymm6, 32(%rax)
3231 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
3232 ; AVX2-NEXT: vmovdqa %ymm0, 64(%rax)
3233 ; AVX2-NEXT: vmovdqa %ymm11, (%rax)
3234 ; AVX2-NEXT: vmovdqa %ymm13, 96(%rax)
3235 ; AVX2-NEXT: vmovdqa %ymm15, 32(%rax)
3236 ; AVX2-NEXT: addq $520, %rsp # imm = 0x208
3237 ; AVX2-NEXT: vzeroupper
3238 ; AVX2-NEXT: retq
3239 ;
3240 ; AVX2-FP-LABEL: load_i64_stride7_vf16:
3241 ; AVX2-FP: # %bb.0:
3242 ; AVX2-FP-NEXT: subq $520, %rsp # imm = 0x208
3243 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm0
3244 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm1
3245 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm2
3246 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm3
3247 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm4
3248 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3249 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3250 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7]
3251 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm5
3252 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm6
3253 ; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3254 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3]
3255 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
3256 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3257 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm4
3258 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3259 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3260 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7]
3261 ; AVX2-FP-NEXT: vmovdqa 672(%rdi), %xmm6
3262 ; AVX2-FP-NEXT: vmovdqa 720(%rdi), %xmm13
3263 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm13[2,3]
3264 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
3265 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3266 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm4
3267 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3268 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3269 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7]
3270 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm11
3271 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm11[2,3]
3272 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
3273 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3274 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm4
3275 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3276 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3277 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7]
3278 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm7
3279 ; AVX2-FP-NEXT: vmovdqa 496(%rdi), %xmm8
3280 ; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3281 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm8[2,3]
3282 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
3283 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3284 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm4
3285 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
3286 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm5
3287 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
3288 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3289 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3290 ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %xmm3
3291 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
3292 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm4
3293 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
3294 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
3295 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3296 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm2
3297 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
3298 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3
3299 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
3300 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
3301 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3302 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm1
3303 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
3304 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm6
3305 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
3306 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
3307 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3308 ; AVX2-FP-NEXT: vpbroadcastq 352(%rdi), %ymm0
3309 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3]
3310 ; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm5
3311 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
3312 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3313 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3314 ; AVX2-FP-NEXT: vpbroadcastq 800(%rdi), %ymm0
3315 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3]
3316 ; AVX2-FP-NEXT: vmovdqa 688(%rdi), %xmm4
3317 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
3318 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
3319 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3320 ; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm0
3321 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
3322 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3]
3323 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3324 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3325 ; AVX2-FP-NEXT: vpbroadcastq 576(%rdi), %ymm0
3326 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3327 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3]
3328 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3329 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3330 ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm0
3331 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3332 ; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm1
3333 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
3334 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3335 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
3336 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3337 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm0
3338 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3339 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm2
3340 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
3341 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3342 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
3343 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3344 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
3345 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3346 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm10
3347 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
3348 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3349 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
3350 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3351 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm0
3352 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3353 ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm4
3354 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
3355 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3356 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]
3357 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
3358 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
3359 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm6
3360 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
3361 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm8
3362 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3]
3363 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3364 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3365 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3366 ; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm3
3367 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
3368 ; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm7
3369 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3]
3370 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3371 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
3372 ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm9
3373 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
3374 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm1
3375 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
3376 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3377 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0
3378 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm10
3379 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
3380 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm5
3381 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
3382 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3383 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm4
3384 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
3385 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm2
3386 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
3387 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
3388 ; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm8
3389 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
3390 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %xmm0
3391 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
3392 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7]
3393 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm3
3394 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
3395 ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm3
3396 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
3397 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
3398 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm9
3399 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
3400 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm15
3401 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
3402 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
3403 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
3404 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm4[1],ymm15[3],ymm4[3]
3405 ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
3406 ; AVX2-FP-NEXT: # xmm2 = mem[0,1],xmm2[2,3]
3407 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm4[4,5,6,7]
3408 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
3409 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
3410 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
3411 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3412 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
3413 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
3414 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3]
3415 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3416 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
3417 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
3418 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3419 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
3420 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3421 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3422 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rsi)
3423 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3424 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi)
3425 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3426 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rsi)
3427 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3428 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi)
3429 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3430 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rdx)
3431 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3432 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx)
3433 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3434 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rdx)
3435 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3436 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
3437 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3438 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rcx)
3439 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3440 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx)
3441 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3442 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rcx)
3443 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3444 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rcx)
3445 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
3446 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%r8)
3447 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3448 ; AVX2-FP-NEXT: vmovaps %ymm2, (%r8)
3449 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3450 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r8)
3451 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3452 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%r8)
3453 ; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9)
3454 ; AVX2-FP-NEXT: vmovdqa %ymm12, 64(%r9)
3455 ; AVX2-FP-NEXT: vmovdqa %ymm14, 96(%r9)
3456 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3457 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r9)
3458 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3459 ; AVX2-FP-NEXT: vmovdqa %ymm10, (%rax)
3460 ; AVX2-FP-NEXT: vmovdqa %ymm1, 64(%rax)
3461 ; AVX2-FP-NEXT: vmovdqa %ymm7, 96(%rax)
3462 ; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rax)
3463 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3464 ; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%rax)
3465 ; AVX2-FP-NEXT: vmovdqa %ymm11, (%rax)
3466 ; AVX2-FP-NEXT: vmovdqa %ymm13, 96(%rax)
3467 ; AVX2-FP-NEXT: vmovdqa %ymm15, 32(%rax)
3468 ; AVX2-FP-NEXT: addq $520, %rsp # imm = 0x208
3469 ; AVX2-FP-NEXT: vzeroupper
3470 ; AVX2-FP-NEXT: retq
3471 ;
3472 ; AVX2-FCP-LABEL: load_i64_stride7_vf16:
3473 ; AVX2-FCP: # %bb.0:
3474 ; AVX2-FCP-NEXT: subq $520, %rsp # imm = 0x208
3475 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm0
3476 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm1
3477 ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm2
3478 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm3
3479 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm4
3480 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3481 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3482 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5],ymm4[6,7]
3483 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm5
3484 ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm6
3485 ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3486 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm6[2,3]
3487 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5,6,7]
3488 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3489 ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %xmm4
3490 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3491 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3492 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2,3,4,5],ymm4[6,7]
3493 ; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %xmm6
3494 ; AVX2-FCP-NEXT: vmovdqa 720(%rdi), %xmm13
3495 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm13[2,3]
3496 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
3497 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3498 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm4
3499 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3500 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3501 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm4[6,7]
3502 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm11
3503 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = mem[0,1],xmm11[2,3]
3504 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3],ymm4[4,5,6,7]
3505 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3506 ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm4
3507 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3508 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3509 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1,2,3,4,5],ymm4[6,7]
3510 ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %xmm7
3511 ; AVX2-FCP-NEXT: vmovdqa 496(%rdi), %xmm8
3512 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3513 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm7[0,1],xmm8[2,3]
3514 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
3515 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3516 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm4
3517 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = xmm5[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
3518 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm5
3519 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
3520 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3521 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3522 ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %xmm3
3523 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = xmm6[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
3524 ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm4
3525 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
3526 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
3527 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3528 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
3529 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
3530 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
3531 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
3532 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
3533 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3534 ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm1
3535 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm0[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
3536 ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm6
3537 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
3538 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
3539 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3540 ; AVX2-FCP-NEXT: vpbroadcastq 352(%rdi), %ymm0
3541 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm5[1],ymm0[3],ymm5[3]
3542 ; AVX2-FCP-NEXT: vmovdqa 240(%rdi), %xmm5
3543 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
3544 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3545 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3546 ; AVX2-FCP-NEXT: vpbroadcastq 800(%rdi), %ymm0
3547 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3]
3548 ; AVX2-FCP-NEXT: vmovdqa 688(%rdi), %xmm4
3549 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
3550 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
3551 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3552 ; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm0
3553 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
3554 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3]
3555 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3556 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3557 ; AVX2-FCP-NEXT: vpbroadcastq 576(%rdi), %ymm0
3558 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
3559 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm6[2,3]
3560 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3561 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3562 ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm0
3563 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3564 ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm1
3565 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
3566 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
3567 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
3568 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3569 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm0
3570 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3571 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm2
3572 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
3573 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3574 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm3[2,3]
3575 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3576 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
3577 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3578 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm10
3579 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
3580 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3581 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm4[2,3]
3582 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3583 ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm0
3584 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
3585 ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %xmm4
3586 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
3587 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
3588 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm5[2,3]
3589 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
3590 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
3591 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6
3592 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
3593 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm8
3594 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm8[0,1],mem[2,3]
3595 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3596 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3597 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
3598 ; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm3
3599 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
3600 ; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm7
3601 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm7[0,1],mem[2,3]
3602 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3603 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
3604 ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm9
3605 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7]
3606 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm1
3607 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],mem[2,3]
3608 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3609 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm0
3610 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm10
3611 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
3612 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm5
3613 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
3614 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
3615 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm4
3616 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm6[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
3617 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm2
3618 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm8 = xmm8[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
3619 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
3620 ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm8
3621 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
3622 ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm0
3623 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
3624 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm3[4,5,6,7]
3625 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm3
3626 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
3627 ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm3
3628 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm9[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
3629 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
3630 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
3631 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm10 = ymm10[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
3632 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm15
3633 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
3634 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
3635 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 16-byte Folded Reload
3636 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm15[1],ymm4[1],ymm15[3],ymm4[3]
3637 ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
3638 ; AVX2-FCP-NEXT: # xmm2 = mem[0,1],xmm2[2,3]
3639 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm4[4,5,6,7]
3640 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 16-byte Folded Reload
3641 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
3642 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm13[0,1],xmm0[2,3]
3643 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3644 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
3645 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
3646 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0,1],mem[2,3]
3647 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3648 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
3649 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
3650 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3651 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
3652 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3653 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3654 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rsi)
3655 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3656 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
3657 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3658 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rsi)
3659 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3660 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi)
3661 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3662 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rdx)
3663 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3664 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx)
3665 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3666 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rdx)
3667 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3668 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx)
3669 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3670 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rcx)
3671 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3672 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx)
3673 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3674 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rcx)
3675 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3676 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rcx)
3677 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
3678 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r8)
3679 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3680 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8)
3681 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3682 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r8)
3683 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3684 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r8)
3685 ; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9)
3686 ; AVX2-FCP-NEXT: vmovdqa %ymm12, 64(%r9)
3687 ; AVX2-FCP-NEXT: vmovdqa %ymm14, 96(%r9)
3688 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
3689 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r9)
3690 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3691 ; AVX2-FCP-NEXT: vmovdqa %ymm10, (%rax)
3692 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 64(%rax)
3693 ; AVX2-FCP-NEXT: vmovdqa %ymm7, 96(%rax)
3694 ; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax)
3695 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3696 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%rax)
3697 ; AVX2-FCP-NEXT: vmovdqa %ymm11, (%rax)
3698 ; AVX2-FCP-NEXT: vmovdqa %ymm13, 96(%rax)
3699 ; AVX2-FCP-NEXT: vmovdqa %ymm15, 32(%rax)
3700 ; AVX2-FCP-NEXT: addq $520, %rsp # imm = 0x208
3701 ; AVX2-FCP-NEXT: vzeroupper
3702 ; AVX2-FCP-NEXT: retq
3703 ;
3704 ; AVX512-LABEL: load_i64_stride7_vf16:
3705 ; AVX512: # %bb.0:
3706 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
3707 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
3708 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm30
3709 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm6
3710 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm0
3711 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm26
3712 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm3
3713 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm27
3714 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm2
3715 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm13
3716 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm15
3717 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm8
3718 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm29
3719 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm9
3720 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm28
3721 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
3722 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
3723 ; AVX512-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
3724 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31
3725 ; AVX512-NEXT: vpermt2q %zmm28, %zmm18, %zmm31
3726 ; AVX512-NEXT: movb $24, %r11b
3727 ; AVX512-NEXT: kmovw %r11d, %k2
3728 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5]
3729 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
3730 ; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3731 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm14
3732 ; AVX512-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
3733 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
3734 ; AVX512-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
3735 ; AVX512-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
3736 ; AVX512-NEXT: movb $-32, %r11b
3737 ; AVX512-NEXT: kmovw %r11d, %k1
3738 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
3739 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm14
3740 ; AVX512-NEXT: vpermt2q %zmm27, %zmm18, %zmm14
3741 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5]
3742 ; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
3743 ; AVX512-NEXT: vpermt2q %zmm30, %zmm17, %zmm16
3744 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
3745 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
3746 ; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3747 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm17
3748 ; AVX512-NEXT: vpermt2q %zmm29, %zmm24, %zmm17
3749 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
3750 ; AVX512-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
3751 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm16
3752 ; AVX512-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
3753 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
3754 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
3755 ; AVX512-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3756 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm17
3757 ; AVX512-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
3758 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
3759 ; AVX512-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
3760 ; AVX512-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
3761 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
3762 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm22
3763 ; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
3764 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm17
3765 ; AVX512-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
3766 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
3767 ; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
3768 ; AVX512-NEXT: vpermt2q %zmm30, %zmm21, %zmm19
3769 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
3770 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
3771 ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3772 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm21
3773 ; AVX512-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
3774 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
3775 ; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
3776 ; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
3777 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
3778 ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
3779 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm19
3780 ; AVX512-NEXT: vpermt2q %zmm29, %zmm25, %zmm19
3781 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm5
3782 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
3783 ; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
3784 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
3785 ; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
3786 ; AVX512-NEXT: vpermt2q %zmm30, %zmm23, %zmm22
3787 ; AVX512-NEXT: vpermi2q %zmm26, %zmm3, %zmm25
3788 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3789 ; AVX512-NEXT: vmovdqa 464(%rdi), %xmm5
3790 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
3791 ; AVX512-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
3792 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
3793 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5
3794 ; AVX512-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
3795 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
3796 ; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
3797 ; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
3798 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
3799 ; AVX512-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
3800 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm22
3801 ; AVX512-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
3802 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4
3803 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
3804 ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4
3805 ; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22
3806 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
3807 ; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
3808 ; AVX512-NEXT: vpermt2q %zmm30, %zmm23, %zmm24
3809 ; AVX512-NEXT: vpermi2q %zmm3, %zmm26, %zmm25
3810 ; AVX512-NEXT: vmovdqa 512(%rdi), %ymm4
3811 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
3812 ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4
3813 ; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23
3814 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
3815 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm4
3816 ; AVX512-NEXT: vpermt2q %zmm15, %zmm18, %zmm4
3817 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
3818 ; AVX512-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
3819 ; AVX512-NEXT: vpermt2q %zmm13, %zmm5, %zmm4
3820 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
3821 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm12
3822 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
3823 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
3824 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10
3825 ; AVX512-NEXT: vpermt2q %zmm28, %zmm3, %zmm10
3826 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3827 ; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24
3828 ; AVX512-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
3829 ; AVX512-NEXT: vpermt2q %zmm30, %zmm5, %zmm18
3830 ; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
3831 ; AVX512-NEXT: vmovdqa 576(%rdi), %ymm5
3832 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
3833 ; AVX512-NEXT: vpermi2q %zmm27, %zmm2, %zmm3
3834 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
3835 ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3
3836 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm4
3837 ; AVX512-NEXT: vpermt2q %zmm8, %zmm20, %zmm4
3838 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
3839 ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
3840 ; AVX512-NEXT: vpermt2q %zmm13, %zmm10, %zmm4
3841 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm11
3842 ; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
3843 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12]
3844 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm7
3845 ; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm7
3846 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
3847 ; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4
3848 ; AVX512-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
3849 ; AVX512-NEXT: vpermt2q %zmm30, %zmm10, %zmm20
3850 ; AVX512-NEXT: vmovdqa 640(%rdi), %ymm7
3851 ; AVX512-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
3852 ; AVX512-NEXT: vpermi2q %zmm27, %zmm2, %zmm12
3853 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
3854 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5
3855 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
3856 ; AVX512-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3857 ; AVX512-NEXT: vpermt2q %zmm15, %zmm7, %zmm8
3858 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
3859 ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
3860 ; AVX512-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
3861 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
3862 ; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm9
3863 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
3864 ; AVX512-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
3865 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
3866 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
3867 ; AVX512-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
3868 ; AVX512-NEXT: vpermt2q %zmm30, %zmm10, %zmm0
3869 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
3870 ; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm6
3871 ; AVX512-NEXT: vpermt2q %zmm27, %zmm12, %zmm2
3872 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
3873 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
3874 ; AVX512-NEXT: vmovdqa64 %zmm14, 64(%rsi)
3875 ; AVX512-NEXT: vmovdqa64 %zmm31, (%rsi)
3876 ; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rdx)
3877 ; AVX512-NEXT: vmovdqa64 %zmm16, (%rdx)
3878 ; AVX512-NEXT: vmovdqa64 %zmm21, 64(%rcx)
3879 ; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx)
3880 ; AVX512-NEXT: vmovdqa64 %zmm23, 64(%r8)
3881 ; AVX512-NEXT: vmovdqa64 %zmm22, (%r8)
3882 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%r9)
3883 ; AVX512-NEXT: vmovdqa64 %zmm24, (%r9)
3884 ; AVX512-NEXT: vmovdqa64 %zmm5, 64(%r10)
3885 ; AVX512-NEXT: vmovdqa64 %zmm4, (%r10)
3886 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax)
3887 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
3888 ; AVX512-NEXT: vzeroupper
3889 ; AVX512-NEXT: retq
3891 ; AVX512-FCP-LABEL: load_i64_stride7_vf16:
3892 ; AVX512-FCP: # %bb.0:
3893 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3894 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3895 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm30
3896 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6
3897 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0
3898 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26
3899 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3
3900 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27
3901 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
3902 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13
3903 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15
3904 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
3905 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29
3906 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9
3907 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm28
3908 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
3909 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
3910 ; AVX512-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
3911 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31
3912 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm31
3913 ; AVX512-FCP-NEXT: movb $24, %r11b
3914 ; AVX512-FCP-NEXT: kmovw %r11d, %k2
3915 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5]
3916 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
3917 ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3918 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm14
3919 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
3920 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
3921 ; AVX512-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
3922 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
3923 ; AVX512-FCP-NEXT: movb $-32, %r11b
3924 ; AVX512-FCP-NEXT: kmovw %r11d, %k1
3925 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
3926 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
3927 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm14
3928 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5]
3929 ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
3930 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm16
3931 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
3932 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
3933 ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3934 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm17
3935 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm17
3936 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
3937 ; AVX512-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
3938 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm16
3939 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
3940 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
3941 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
3942 ; AVX512-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3943 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
3944 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
3945 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
3946 ; AVX512-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
3947 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
3948 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
3949 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm22
3950 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
3951 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm17
3952 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
3953 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
3954 ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
3955 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm19
3956 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
3957 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
3958 ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3959 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm21
3960 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
3961 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
3962 ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
3963 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
3964 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
3965 ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
3966 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
3967 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm19
3968 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
3969 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
3970 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
3971 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
3972 ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
3973 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm22
3974 ; AVX512-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm25
3975 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
3976 ; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
3977 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
3978 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
3979 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
3980 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
3981 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
3982 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
3983 ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
3984 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
3985 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
3986 ; AVX512-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
3987 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm22
3988 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
3989 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
3990 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
3991 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
3992 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22
3993 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
3994 ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
3995 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm24
3996 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm26, %zmm25
3997 ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm4
3998 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
3999 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
4000 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23
4001 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
4002 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm4
4003 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm4
4004 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
4005 ; AVX512-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
4006 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4
4007 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
4008 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
4009 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
4010 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
4011 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
4012 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10
4013 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
4014 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24
4015 ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
4016 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm18
4017 ; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
4018 ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm5
4019 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
4020 ; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm3
4021 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
4022 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3
4023 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4
4024 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm4
4025 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
4026 ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
4027 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4
4028 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
4029 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
4030 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12]
4031 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
4032 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7
4033 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
4034 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4
4035 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
4036 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm20
4037 ; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm7
4038 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
4039 ; AVX512-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm12
4040 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
4041 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5
4042 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
4043 ; AVX512-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4044 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm8
4045 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
4046 ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
4047 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
4048 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
4049 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm9
4050 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
4051 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
4052 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
4053 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
4054 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
4055 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm0
4056 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
4057 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm6
4058 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm2
4059 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
4060 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
4061 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi)
4062 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
4063 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
4064 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
4065 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx)
4066 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
4067 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8)
4068 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
4069 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9)
4070 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
4071 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 64(%r10)
4072 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
4073 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
4074 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
4075 ; AVX512-FCP-NEXT: vzeroupper
4076 ; AVX512-FCP-NEXT: retq
4078 ; AVX512DQ-LABEL: load_i64_stride7_vf16:
4079 ; AVX512DQ: # %bb.0:
4080 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
4081 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
4082 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm30
4083 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm6
4084 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm0
4085 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm26
4086 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm3
4087 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm27
4088 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm2
4089 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm13
4090 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm15
4091 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm8
4092 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm29
4093 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm9
4094 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm28
4095 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1
4096 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
4097 ; AVX512DQ-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
4098 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31
4099 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm18, %zmm31
4100 ; AVX512DQ-NEXT: movb $24, %r11b
4101 ; AVX512DQ-NEXT: kmovw %r11d, %k2
4102 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5]
4103 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
4104 ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4105 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm14
4106 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
4107 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
4108 ; AVX512DQ-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
4109 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
4110 ; AVX512DQ-NEXT: movb $-32, %r11b
4111 ; AVX512DQ-NEXT: kmovw %r11d, %k1
4112 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
4113 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm14
4114 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm18, %zmm14
4115 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5]
4116 ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
4117 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm17, %zmm16
4118 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
4119 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
4120 ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4121 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm17
4122 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm24, %zmm17
4123 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
4124 ; AVX512DQ-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
4125 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm16
4126 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
4127 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
4128 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
4129 ; AVX512DQ-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4130 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm17
4131 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
4132 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
4133 ; AVX512DQ-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
4134 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
4135 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
4136 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm22
4137 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
4138 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm17
4139 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
4140 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
4141 ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
4142 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm21, %zmm19
4143 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
4144 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
4145 ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4146 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm21
4147 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
4148 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
4149 ; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4150 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
4151 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
4152 ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4153 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm19
4154 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm25, %zmm19
4155 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm5
4156 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
4157 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
4158 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
4159 ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
4160 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm23, %zmm22
4161 ; AVX512DQ-NEXT: vpermi2q %zmm26, %zmm3, %zmm25
4162 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4163 ; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm5
4164 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
4165 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
4166 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
4167 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5
4168 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
4169 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
4170 ; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4171 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
4172 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
4173 ; AVX512DQ-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4174 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm22
4175 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
4176 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4
4177 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
4178 ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4
4179 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22
4180 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
4181 ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
4182 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm23, %zmm24
4183 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm26, %zmm25
4184 ; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm4
4185 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
4186 ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4
4187 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23
4188 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
4189 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm4
4190 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm18, %zmm4
4191 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
4192 ; AVX512DQ-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
4193 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm5, %zmm4
4194 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
4195 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm12
4196 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
4197 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
4198 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10
4199 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm3, %zmm10
4200 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
4201 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24
4202 ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
4203 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm5, %zmm18
4204 ; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
4205 ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm5
4206 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
4207 ; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm2, %zmm3
4208 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
4209 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3
4210 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4
4211 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm20, %zmm4
4212 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
4213 ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
4214 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm4
4215 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm11
4216 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
4217 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12]
4218 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm7
4219 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm7
4220 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
4221 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4
4222 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
4223 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm10, %zmm20
4224 ; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm7
4225 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
4226 ; AVX512DQ-NEXT: vpermi2q %zmm27, %zmm2, %zmm12
4227 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
4228 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5
4229 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
4230 ; AVX512DQ-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4231 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm7, %zmm8
4232 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
4233 ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
4234 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
4235 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
4236 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm9
4237 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
4238 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
4239 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
4240 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
4241 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
4242 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm10, %zmm0
4243 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
4244 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm6
4245 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm12, %zmm2
4246 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
4247 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
4248 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, 64(%rsi)
4249 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, (%rsi)
4250 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rdx)
4251 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, (%rdx)
4252 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 64(%rcx)
4253 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx)
4254 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 64(%r8)
4255 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, (%r8)
4256 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%r9)
4257 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, (%r9)
4258 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 64(%r10)
4259 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r10)
4260 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax)
4261 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
4262 ; AVX512DQ-NEXT: vzeroupper
4263 ; AVX512DQ-NEXT: retq
4265 ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf16:
4266 ; AVX512DQ-FCP: # %bb.0:
4267 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4268 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4269 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm30
4270 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6
4271 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0
4272 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26
4273 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm3
4274 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27
4275 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
4276 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13
4277 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15
4278 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
4279 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm29
4280 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9
4281 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm28
4282 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
4283 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
4284 ; AVX512DQ-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
4285 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31
4286 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm31
4287 ; AVX512DQ-FCP-NEXT: movb $24, %r11b
4288 ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2
4289 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm29[4,5,4,5]
4290 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
4291 ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4292 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm14
4293 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
4294 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
4295 ; AVX512DQ-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
4296 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
4297 ; AVX512DQ-FCP-NEXT: movb $-32, %r11b
4298 ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1
4299 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
4300 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
4301 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm18, %zmm14
4302 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm3[4,5,4,5],zmm26[4,5,4,5]
4303 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
4304 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm17, %zmm16
4305 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
4306 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
4307 ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4308 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm17
4309 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm17
4310 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
4311 ; AVX512DQ-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
4312 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm16
4313 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
4314 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
4315 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
4316 ; AVX512DQ-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4317 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
4318 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
4319 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
4320 ; AVX512DQ-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
4321 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
4322 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
4323 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm22
4324 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
4325 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm17
4326 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
4327 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
4328 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
4329 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm21, %zmm19
4330 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
4331 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
4332 ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4333 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm21
4334 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
4335 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
4336 ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4337 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
4338 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
4339 ; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4340 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
4341 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm19
4342 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
4343 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
4344 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
4345 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
4346 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
4347 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm22
4348 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm26, %zmm3, %zmm25
4349 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4350 ; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
4351 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
4352 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
4353 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
4354 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
4355 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
4356 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
4357 ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4358 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
4359 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
4360 ; AVX512DQ-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4361 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm22
4362 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
4363 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
4364 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
4365 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
4366 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm22, %zmm22
4367 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
4368 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
4369 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm23, %zmm24
4370 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm26, %zmm25
4371 ; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm4
4372 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
4373 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
4374 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm25, %zmm23
4375 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
4376 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm4
4377 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm4
4378 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [4,5,6,13,4,5,6,13]
4379 ; AVX512DQ-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
4380 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm4
4381 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
4382 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
4383 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
4384 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
4385 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
4386 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm3, %zmm10
4387 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
4388 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm4, %zmm24
4389 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
4390 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm5, %zmm18
4391 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
4392 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm5
4393 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
4394 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm3
4395 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
4396 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm18, %zmm3
4397 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4
4398 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm4
4399 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
4400 ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
4401 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm4
4402 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm11
4403 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm12[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
4404 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [5,12]
4405 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
4406 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm7
4407 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
4408 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm4, %zmm4
4409 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
4410 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm20
4411 ; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm7
4412 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
4413 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm27, %zmm2, %zmm12
4414 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm12[0,1,2,3],ymm5[4,5,6,7]
4415 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm20, %zmm5
4416 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [2,9,2,9,2,9,2,9]
4417 ; AVX512DQ-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4418 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm7, %zmm8
4419 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
4420 ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
4421 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
4422 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
4423 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm9
4424 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
4425 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
4426 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
4427 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
4428 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm7, %zmm0
4429 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm0
4430 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
4431 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm6
4432 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm12, %zmm2
4433 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
4434 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
4435 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi)
4436 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
4437 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
4438 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
4439 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx)
4440 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
4441 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8)
4442 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
4443 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%r9)
4444 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
4445 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 64(%r10)
4446 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
4447 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
4448 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
4449 ; AVX512DQ-FCP-NEXT: vzeroupper
4450 ; AVX512DQ-FCP-NEXT: retq
4452 ; AVX512BW-LABEL: load_i64_stride7_vf16:
4453 ; AVX512BW: # %bb.0:
4454 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
4455 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
4456 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm3
4457 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm6
4458 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm0
4459 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm26
4460 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4
4461 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm28
4462 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2
4463 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm13
4464 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm15
4465 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm8
4466 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30
4467 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm9
4468 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29
4469 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
4470 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
4471 ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
4472 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31
4473 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm31
4474 ; AVX512BW-NEXT: movb $24, %r11b
4475 ; AVX512BW-NEXT: kmovd %r11d, %k2
4476 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
4477 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
4478 ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4479 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm14
4480 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
4481 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
4482 ; AVX512BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
4483 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
4484 ; AVX512BW-NEXT: movb $-32, %r11b
4485 ; AVX512BW-NEXT: kmovd %r11d, %k1
4486 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
4487 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14
4488 ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm14
4489 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5]
4490 ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
4491 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16
4492 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
4493 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
4494 ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4495 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm17
4496 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm24, %zmm17
4497 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
4498 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
4499 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm16
4500 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
4501 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
4502 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
4503 ; AVX512BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4504 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm17
4505 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
4506 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
4507 ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
4508 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
4509 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
4510 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm22
4511 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
4512 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm17
4513 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
4514 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
4515 ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
4516 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm19
4517 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
4518 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
4519 ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4520 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm21
4521 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
4522 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
4523 ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4524 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
4525 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
4526 ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4527 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm19
4528 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm25, %zmm19
4529 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm5
4530 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
4531 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
4532 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
4533 ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
4534 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm22
4535 ; AVX512BW-NEXT: vpermi2q %zmm26, %zmm4, %zmm25
4536 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4537 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm5
4538 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
4539 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
4540 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
4541 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5
4542 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
4543 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
4544 ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4545 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
4546 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
4547 ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4548 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm22
4549 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
4550 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm27
4551 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23]
4552 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm27, %xmm27
4553 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22
4554 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
4555 ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
4556 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm24
4557 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm26, %zmm25
4558 ; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm5
4559 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
4560 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5
4561 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23
4562 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
4563 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5
4564 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm5
4565 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13]
4566 ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4567 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm5
4568 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4
4569 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm11
4570 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
4571 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11]
4572 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
4573 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm10
4574 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
4575 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24
4576 ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
4577 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm18
4578 ; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
4579 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm5
4580 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
4581 ; AVX512BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm12
4582 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
4583 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25
4584 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10
4585 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm10
4586 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
4587 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
4588 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10
4589 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm18
4590 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
4591 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12]
4592 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7
4593 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm7
4594 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
4595 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7
4596 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
4597 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm20
4598 ; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm10
4599 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
4600 ; AVX512BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm4
4601 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4602 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
4603 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9]
4604 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4605 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm8
4606 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
4607 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
4608 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
4609 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
4610 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm9
4611 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
4612 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
4613 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
4614 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
4615 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0
4616 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm0
4617 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
4618 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
4619 ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm2
4620 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
4621 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
4622 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rsi)
4623 ; AVX512BW-NEXT: vmovdqa64 %zmm31, (%rsi)
4624 ; AVX512BW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
4625 ; AVX512BW-NEXT: vmovdqa64 %zmm16, (%rdx)
4626 ; AVX512BW-NEXT: vmovdqa64 %zmm21, 64(%rcx)
4627 ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx)
4628 ; AVX512BW-NEXT: vmovdqa64 %zmm23, 64(%r8)
4629 ; AVX512BW-NEXT: vmovdqa64 %zmm22, (%r8)
4630 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%r9)
4631 ; AVX512BW-NEXT: vmovdqa64 %zmm24, (%r9)
4632 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r10)
4633 ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%r10)
4634 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
4635 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
4636 ; AVX512BW-NEXT: vzeroupper
4637 ; AVX512BW-NEXT: retq
4639 ; AVX512BW-FCP-LABEL: load_i64_stride7_vf16:
4640 ; AVX512BW-FCP: # %bb.0:
4641 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4642 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4643 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
4644 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6
4645 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0
4646 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26
4647 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
4648 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
4649 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
4650 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13
4651 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15
4652 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
4653 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
4654 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9
4655 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
4656 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
4657 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
4658 ; AVX512BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
4659 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31
4660 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm18, %zmm31
4661 ; AVX512BW-FCP-NEXT: movb $24, %r11b
4662 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k2
4663 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
4664 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
4665 ; AVX512BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4666 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14
4667 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
4668 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
4669 ; AVX512BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
4670 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
4671 ; AVX512BW-FCP-NEXT: movb $-32, %r11b
4672 ; AVX512BW-FCP-NEXT: kmovd %r11d, %k1
4673 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
4674 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
4675 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm14
4676 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5]
4677 ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
4678 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm16
4679 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
4680 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
4681 ; AVX512BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4682 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm17
4683 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm24, %zmm17
4684 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
4685 ; AVX512BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
4686 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm16
4687 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
4688 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
4689 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
4690 ; AVX512BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4691 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
4692 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
4693 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
4694 ; AVX512BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
4695 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
4696 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
4697 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22
4698 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
4699 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm17
4700 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
4701 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
4702 ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
4703 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm19
4704 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
4705 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
4706 ; AVX512BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4707 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm21
4708 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
4709 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
4710 ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4711 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
4712 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
4713 ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4714 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
4715 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm25, %zmm19
4716 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
4717 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
4718 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
4719 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
4720 ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
4721 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm22
4722 ; AVX512BW-FCP-NEXT: vpermi2q %zmm26, %zmm4, %zmm25
4723 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4724 ; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
4725 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
4726 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
4727 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
4728 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
4729 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
4730 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
4731 ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4732 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
4733 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
4734 ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4735 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22
4736 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
4737 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
4738 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23]
4739 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm27, %xmm27
4740 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22
4741 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
4742 ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
4743 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm24
4744 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm26, %zmm25
4745 ; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm5
4746 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
4747 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
4748 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23
4749 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
4750 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
4751 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm5
4752 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13]
4753 ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4754 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm5
4755 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4
4756 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
4757 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
4758 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11]
4759 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
4760 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm10
4761 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
4762 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24
4763 ; AVX512BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
4764 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm18
4765 ; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
4766 ; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm5
4767 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
4768 ; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm12
4769 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
4770 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25
4771 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
4772 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm10
4773 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
4774 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
4775 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10
4776 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18
4777 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
4778 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12]
4779 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
4780 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7
4781 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
4782 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7
4783 ; AVX512BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
4784 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm20
4785 ; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm10
4786 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
4787 ; AVX512BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm4
4788 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4789 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
4790 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9]
4791 ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4792 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm8
4793 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
4794 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
4795 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
4796 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
4797 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9
4798 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
4799 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
4800 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
4801 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
4802 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0
4803 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0
4804 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
4805 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
4806 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm2
4807 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
4808 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
4809 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi)
4810 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
4811 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
4812 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
4813 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx)
4814 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
4815 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8)
4816 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
4817 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9)
4818 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
4819 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r10)
4820 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%r10)
4821 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
4822 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
4823 ; AVX512BW-FCP-NEXT: vzeroupper
4824 ; AVX512BW-FCP-NEXT: retq
4825 ;
4826 ; AVX512DQ-BW-LABEL: load_i64_stride7_vf16:
4827 ; AVX512DQ-BW: # %bb.0:
4828 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
4829 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
4830 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm3
4831 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm6
4832 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm0
4833 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm26
4834 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm4
4835 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm28
4836 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2
4837 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm13
4838 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm15
4839 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm8
4840 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30
4841 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm9
4842 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29
4843 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
4844 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
4845 ; AVX512DQ-BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
4846 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm31
4847 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm18, %zmm31
4848 ; AVX512DQ-BW-NEXT: movb $24, %r11b
4849 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k2
4850 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
4851 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
4852 ; AVX512DQ-BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4853 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm14
4854 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
4855 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
4856 ; AVX512DQ-BW-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
4857 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
4858 ; AVX512DQ-BW-NEXT: movb $-32, %r11b
4859 ; AVX512DQ-BW-NEXT: kmovd %r11d, %k1
4860 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
4861 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14
4862 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm18, %zmm14
4863 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5]
4864 ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
4865 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm17, %zmm16
4866 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
4867 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
4868 ; AVX512DQ-BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4869 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm17
4870 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm24, %zmm17
4871 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
4872 ; AVX512DQ-BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
4873 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm16
4874 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
4875 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
4876 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
4877 ; AVX512DQ-BW-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4878 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm17
4879 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
4880 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
4881 ; AVX512DQ-BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
4882 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
4883 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
4884 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm22
4885 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
4886 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm17
4887 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
4888 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
4889 ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
4890 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm21, %zmm19
4891 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
4892 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
4893 ; AVX512DQ-BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4894 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm21
4895 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
4896 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
4897 ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4898 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
4899 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
4900 ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4901 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm19
4902 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm25, %zmm19
4903 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm5
4904 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
4905 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
4906 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
4907 ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
4908 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm22
4909 ; AVX512DQ-BW-NEXT: vpermi2q %zmm26, %zmm4, %zmm25
4910 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
4911 ; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm5
4912 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
4913 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
4914 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
4915 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5
4916 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
4917 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
4918 ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
4919 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
4920 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
4921 ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4922 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm22
4923 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
4924 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm27
4925 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23]
4926 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm27, %xmm27
4927 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22
4928 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
4929 ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
4930 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm24
4931 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm26, %zmm25
4932 ; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm5
4933 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
4934 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5
4935 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23
4936 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
4937 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm5
4938 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm18, %zmm5
4939 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13]
4940 ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
4941 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm25, %zmm5
4942 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4
4943 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm11
4944 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
4945 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11]
4946 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10
4947 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm10
4948 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
4949 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24
4950 ; AVX512DQ-BW-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
4951 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm25, %zmm18
4952 ; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
4953 ; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm5
4954 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
4955 ; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm12
4956 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
4957 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25
4958 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10
4959 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm20, %zmm10
4960 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
4961 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
4962 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm10
4963 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %ymm18
4964 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
4965 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12]
4966 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm7
4967 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm7
4968 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
4969 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7
4970 ; AVX512DQ-BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
4971 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm12, %zmm20
4972 ; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm10
4973 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
4974 ; AVX512DQ-BW-NEXT: vpermi2q %zmm28, %zmm2, %zmm4
4975 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4976 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
4977 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9]
4978 ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4979 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm5, %zmm8
4980 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
4981 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
4982 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
4983 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
4984 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm11, %zmm9
4985 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
4986 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
4987 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
4988 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
4989 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm5, %zmm0
4990 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm10, %zmm0
4991 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
4992 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
4993 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm2
4994 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
4995 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
4996 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rsi)
4997 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, (%rsi)
4998 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 64(%rdx)
4999 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, (%rdx)
5000 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 64(%rcx)
5001 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rcx)
5002 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 64(%r8)
5003 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, (%r8)
5004 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%r9)
5005 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%r9)
5006 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r10)
5007 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%r10)
5008 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax)
5009 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
5010 ; AVX512DQ-BW-NEXT: vzeroupper
5011 ; AVX512DQ-BW-NEXT: retq
5012 ;
5013 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf16:
5014 ; AVX512DQ-BW-FCP: # %bb.0:
5015 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5016 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
5017 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
5018 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm6
5019 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm0
5020 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm26
5021 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
5022 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
5023 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
5024 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm13
5025 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm15
5026 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm8
5027 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
5028 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm9
5029 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
5030 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
5031 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [0,7,14,0,0,7,14,0]
5032 ; AVX512DQ-BW-FCP-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
5033 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm31
5034 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm18, %zmm31
5035 ; AVX512DQ-BW-FCP-NEXT: movb $24, %r11b
5036 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2
5037 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm9[4,5,4,5],zmm30[4,5,4,5]
5038 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [10,3,10,3,10,3,10,3]
5039 ; AVX512DQ-BW-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5040 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm14
5041 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm16, %zmm14
5042 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [0,5,6,9,0,5,6,9]
5043 ; AVX512DQ-BW-FCP-NEXT: # zmm17 = mem[0,1,2,3,0,1,2,3]
5044 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm17, %zmm14
5045 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %r11b
5046 ; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1
5047 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm31 {%k1}
5048 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
5049 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm18, %zmm14
5050 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k2} = zmm4[4,5,4,5],zmm26[4,5,4,5]
5051 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm16
5052 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm17, %zmm16
5053 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm14 {%k1}
5054 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [13,6,13,6,13,6,13,6]
5055 ; AVX512DQ-BW-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5056 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm17
5057 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm24, %zmm17
5058 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [9,0,7,0,9,0,7,0]
5059 ; AVX512DQ-BW-FCP-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3]
5060 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm16
5061 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm20, %zmm16
5062 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k2}
5063 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm19 = [11,4,11,4,11,4,11,4]
5064 ; AVX512DQ-BW-FCP-NEXT: # zmm19 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5065 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm17
5066 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm19, %zmm17
5067 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm21 = [0,5,6,10,0,5,6,10]
5068 ; AVX512DQ-BW-FCP-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3]
5069 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm21, %zmm17
5070 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1}
5071 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm22
5072 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm22
5073 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm17
5074 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm20, %zmm17
5075 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
5076 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm19
5077 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm21, %zmm19
5078 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm17 {%k1}
5079 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
5080 ; AVX512DQ-BW-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5081 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm21
5082 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm22, %zmm21
5083 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,11,0,5,6,11]
5084 ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
5085 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm21
5086 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [14,0,0,7,14,0,0,7]
5087 ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
5088 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm19
5089 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm25, %zmm19
5090 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm5
5091 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
5092 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm19
5093 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm19 {%k1}
5094 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm22
5095 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm22
5096 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm26, %zmm4, %zmm25
5097 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5098 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
5099 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
5100 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm21
5101 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm21 {%k1}
5102 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
5103 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm24, %zmm5
5104 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm23 = [0,5,6,12,0,5,6,12]
5105 ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3]
5106 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm5
5107 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [7,0,9,0,7,0,9,0]
5108 ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
5109 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm22
5110 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm25, %zmm22
5111 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
5112 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm27 = mem[8,9,10,11,12,13,14,15],ymm27[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm27[16,17,18,19,20,21,22,23]
5113 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm27, %xmm27
5114 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm27, %zmm22, %zmm22
5115 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm22 {%k1}
5116 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm24
5117 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm23, %zmm24
5118 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm26, %zmm25
5119 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm5
5120 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
5121 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
5122 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm25, %zmm23
5123 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm23 {%k1}
5124 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
5125 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm18, %zmm5
5126 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [4,5,6,13,4,5,6,13]
5127 ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
5128 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm25, %zmm5
5129 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm4
5130 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11
5131 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3,4,5],ymm4[6,7]
5132 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [4,11]
5133 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
5134 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm10
5135 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm4[4,5,6,7]
5136 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm5, %zmm24
5137 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm6, %zmm0, %zmm18
5138 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm25, %zmm18
5139 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm4
5140 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm5
5141 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
5142 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm12
5143 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
5144 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm18, %zmm25
5145 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
5146 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm20, %zmm10
5147 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [4,5,6,14,4,5,6,14]
5148 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
5149 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm10
5150 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm18
5151 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm18[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm18[16,17,18,19,20,21,22,23]
5152 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm4 = [5,12]
5153 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm7
5154 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm7
5155 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm11[4,5,6,7]
5156 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm7
5157 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm0, %zmm6, %zmm20
5158 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm12, %zmm20
5159 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm10
5160 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm5[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
5161 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm28, %zmm2, %zmm4
5162 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
5163 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm20, %zmm4
5164 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [2,9,2,9,2,9,2,9]
5165 ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
5166 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm5, %zmm8
5167 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,8,15,4,5,8,15]
5168 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
5169 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm8
5170 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm11 = [0,0,4,11]
5171 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm9
5172 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm12 = [6,13]
5173 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm12, %zmm1
5174 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
5175 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm8, %zmm1
5176 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm5, %zmm0
5177 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm10, %zmm0
5178 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
5179 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
5180 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm2
5181 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5182 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0
5183 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rsi)
5184 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, (%rsi)
5185 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rdx)
5186 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, (%rdx)
5187 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rcx)
5188 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
5189 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 64(%r8)
5190 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, (%r8)
5191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%r9)
5192 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%r9)
5193 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r10)
5194 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%r10)
5195 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax)
5196 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
5197 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
5198 ; AVX512DQ-BW-FCP-NEXT: retq
5199 %wide.vec = load <112 x i64>, ptr %in.vec, align 64
5200 %strided.vec0 = shufflevector <112 x i64> %wide.vec, <112 x i64> poison, <16 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105>
5201 %strided.vec1 = shufflevector <112 x i64> %wide.vec, <112 x i64> poison, <16 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106>
5202 %strided.vec2 = shufflevector <112 x i64> %wide.vec, <112 x i64> poison, <16 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107>
5203 %strided.vec3 = shufflevector <112 x i64> %wide.vec, <112 x i64> poison, <16 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108>
5204 %strided.vec4 = shufflevector <112 x i64> %wide.vec, <112 x i64> poison, <16 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109>
5205 %strided.vec5 = shufflevector <112 x i64> %wide.vec, <112 x i64> poison, <16 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110>
5206 %strided.vec6 = shufflevector <112 x i64> %wide.vec, <112 x i64> poison, <16 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111>
5207 store <16 x i64> %strided.vec0, ptr %out.vec0, align 64
5208 store <16 x i64> %strided.vec1, ptr %out.vec1, align 64
5209 store <16 x i64> %strided.vec2, ptr %out.vec2, align 64
5210 store <16 x i64> %strided.vec3, ptr %out.vec3, align 64
5211 store <16 x i64> %strided.vec4, ptr %out.vec4, align 64
5212 store <16 x i64> %strided.vec5, ptr %out.vec5, align 64
5213 store <16 x i64> %strided.vec6, ptr %out.vec6, align 64
5214 ret void
5215 }
5217 define void @load_i64_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
5218 ; SSE-LABEL: load_i64_stride7_vf32:
5219 ; SSE: # %bb.0:
5220 ; SSE-NEXT: subq $1448, %rsp # imm = 0x5A8
5221 ; SSE-NEXT: movapd 208(%rdi), %xmm3
5222 ; SSE-NEXT: movapd 96(%rdi), %xmm2
5223 ; SSE-NEXT: movapd 144(%rdi), %xmm4
5224 ; SSE-NEXT: movapd 192(%rdi), %xmm6
5225 ; SSE-NEXT: movapd 80(%rdi), %xmm5
5226 ; SSE-NEXT: movapd 128(%rdi), %xmm8
5227 ; SSE-NEXT: movapd 64(%rdi), %xmm10
5228 ; SSE-NEXT: movapd 176(%rdi), %xmm11
5229 ; SSE-NEXT: movapd (%rdi), %xmm12
5230 ; SSE-NEXT: movapd 16(%rdi), %xmm9
5231 ; SSE-NEXT: movapd 32(%rdi), %xmm7
5232 ; SSE-NEXT: movapd 48(%rdi), %xmm0
5233 ; SSE-NEXT: movapd 224(%rdi), %xmm13
5234 ; SSE-NEXT: movapd 112(%rdi), %xmm14
5235 ; SSE-NEXT: movapd 160(%rdi), %xmm1
5236 ; SSE-NEXT: movapd %xmm0, %xmm15
5237 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1]
5238 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5239 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0]
5240 ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5241 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1]
5242 ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5243 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0]
5244 ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5245 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1]
5246 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5247 ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0]
5248 ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5249 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
5250 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5251 ; SSE-NEXT: movapd %xmm1, %xmm0
5252 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1]
5253 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5254 ; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0]
5255 ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5256 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1]
5257 ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5258 ; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0]
5259 ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5260 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
5261 ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5262 ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0]
5263 ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5264 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
5265 ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5266 ; SSE-NEXT: movapd 272(%rdi), %xmm0
5267 ; SSE-NEXT: movapd %xmm0, %xmm1
5268 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1]
5269 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5270 ; SSE-NEXT: movapd 288(%rdi), %xmm1
5271 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0]
5272 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5273 ; SSE-NEXT: movapd 240(%rdi), %xmm2
5274 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5275 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5276 ; SSE-NEXT: movapd 304(%rdi), %xmm1
5277 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5278 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5279 ; SSE-NEXT: movapd 256(%rdi), %xmm2
5280 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5281 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5282 ; SSE-NEXT: movapd 320(%rdi), %xmm1
5283 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5284 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5285 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5286 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5287 ; SSE-NEXT: movapd 336(%rdi), %xmm2
5288 ; SSE-NEXT: movapd 384(%rdi), %xmm0
5289 ; SSE-NEXT: movapd %xmm0, %xmm1
5290 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5291 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5292 ; SSE-NEXT: movapd 400(%rdi), %xmm1
5293 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5294 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5295 ; SSE-NEXT: movapd 352(%rdi), %xmm2
5296 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5297 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5298 ; SSE-NEXT: movapd 416(%rdi), %xmm1
5299 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5300 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5301 ; SSE-NEXT: movapd 368(%rdi), %xmm2
5302 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5303 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5304 ; SSE-NEXT: movapd 432(%rdi), %xmm1
5305 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5306 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5307 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5308 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5309 ; SSE-NEXT: movapd 448(%rdi), %xmm2
5310 ; SSE-NEXT: movapd 496(%rdi), %xmm0
5311 ; SSE-NEXT: movapd %xmm0, %xmm1
5312 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5313 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5314 ; SSE-NEXT: movapd 512(%rdi), %xmm1
5315 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5316 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5317 ; SSE-NEXT: movapd 464(%rdi), %xmm2
5318 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5319 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5320 ; SSE-NEXT: movapd 528(%rdi), %xmm1
5321 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5322 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5323 ; SSE-NEXT: movapd 480(%rdi), %xmm2
5324 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5325 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5326 ; SSE-NEXT: movapd 544(%rdi), %xmm1
5327 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5328 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5329 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5330 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5331 ; SSE-NEXT: movapd 560(%rdi), %xmm2
5332 ; SSE-NEXT: movapd 608(%rdi), %xmm0
5333 ; SSE-NEXT: movapd %xmm0, %xmm1
5334 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5335 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5336 ; SSE-NEXT: movapd 624(%rdi), %xmm1
5337 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5338 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5339 ; SSE-NEXT: movapd 576(%rdi), %xmm2
5340 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5341 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5342 ; SSE-NEXT: movapd 640(%rdi), %xmm1
5343 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5344 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5345 ; SSE-NEXT: movapd 592(%rdi), %xmm2
5346 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5347 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5348 ; SSE-NEXT: movapd 656(%rdi), %xmm1
5349 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5350 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5351 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5352 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5353 ; SSE-NEXT: movapd 672(%rdi), %xmm2
5354 ; SSE-NEXT: movapd 720(%rdi), %xmm0
5355 ; SSE-NEXT: movapd %xmm0, %xmm1
5356 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5357 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5358 ; SSE-NEXT: movapd 736(%rdi), %xmm1
5359 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5360 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5361 ; SSE-NEXT: movapd 688(%rdi), %xmm2
5362 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5363 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5364 ; SSE-NEXT: movapd 752(%rdi), %xmm1
5365 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5366 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5367 ; SSE-NEXT: movapd 704(%rdi), %xmm2
5368 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5369 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5370 ; SSE-NEXT: movapd 768(%rdi), %xmm1
5371 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5372 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5373 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5374 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5375 ; SSE-NEXT: movapd 784(%rdi), %xmm2
5376 ; SSE-NEXT: movapd 832(%rdi), %xmm0
5377 ; SSE-NEXT: movapd %xmm0, %xmm1
5378 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5379 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5380 ; SSE-NEXT: movapd 848(%rdi), %xmm1
5381 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5382 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5383 ; SSE-NEXT: movapd 800(%rdi), %xmm2
5384 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5385 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5386 ; SSE-NEXT: movapd 864(%rdi), %xmm1
5387 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5388 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5389 ; SSE-NEXT: movapd 816(%rdi), %xmm2
5390 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5391 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5392 ; SSE-NEXT: movapd 880(%rdi), %xmm1
5393 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5394 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5395 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5396 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5397 ; SSE-NEXT: movapd 896(%rdi), %xmm2
5398 ; SSE-NEXT: movapd 944(%rdi), %xmm0
5399 ; SSE-NEXT: movapd %xmm0, %xmm1
5400 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5401 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5402 ; SSE-NEXT: movapd 960(%rdi), %xmm1
5403 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5404 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5405 ; SSE-NEXT: movapd 912(%rdi), %xmm2
5406 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5407 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5408 ; SSE-NEXT: movapd 976(%rdi), %xmm1
5409 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5410 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5411 ; SSE-NEXT: movapd 928(%rdi), %xmm2
5412 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5413 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5414 ; SSE-NEXT: movapd 992(%rdi), %xmm1
5415 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5416 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5417 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5418 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5419 ; SSE-NEXT: movapd 1008(%rdi), %xmm2
5420 ; SSE-NEXT: movapd 1056(%rdi), %xmm0
5421 ; SSE-NEXT: movapd %xmm0, %xmm1
5422 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5423 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5424 ; SSE-NEXT: movapd 1072(%rdi), %xmm1
5425 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5426 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5427 ; SSE-NEXT: movapd 1024(%rdi), %xmm2
5428 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5429 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5430 ; SSE-NEXT: movapd 1088(%rdi), %xmm1
5431 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5432 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5433 ; SSE-NEXT: movapd 1040(%rdi), %xmm2
5434 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5435 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5436 ; SSE-NEXT: movapd 1104(%rdi), %xmm1
5437 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5438 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5439 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5440 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5441 ; SSE-NEXT: movapd 1120(%rdi), %xmm2
5442 ; SSE-NEXT: movapd 1168(%rdi), %xmm0
5443 ; SSE-NEXT: movapd %xmm0, %xmm1
5444 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5445 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5446 ; SSE-NEXT: movapd 1184(%rdi), %xmm1
5447 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5448 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5449 ; SSE-NEXT: movapd 1136(%rdi), %xmm2
5450 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5451 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5452 ; SSE-NEXT: movapd 1200(%rdi), %xmm1
5453 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5454 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5455 ; SSE-NEXT: movapd 1152(%rdi), %xmm2
5456 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5457 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5458 ; SSE-NEXT: movapd 1216(%rdi), %xmm1
5459 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5460 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5461 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5462 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5463 ; SSE-NEXT: movapd 1232(%rdi), %xmm2
5464 ; SSE-NEXT: movapd 1280(%rdi), %xmm0
5465 ; SSE-NEXT: movapd %xmm0, %xmm1
5466 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5467 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5468 ; SSE-NEXT: movapd 1296(%rdi), %xmm1
5469 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5470 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5471 ; SSE-NEXT: movapd 1248(%rdi), %xmm2
5472 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5473 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5474 ; SSE-NEXT: movapd 1312(%rdi), %xmm1
5475 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5476 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5477 ; SSE-NEXT: movapd 1264(%rdi), %xmm2
5478 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5479 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5480 ; SSE-NEXT: movapd 1328(%rdi), %xmm1
5481 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5482 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5483 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5484 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5485 ; SSE-NEXT: movapd 1344(%rdi), %xmm14
5486 ; SSE-NEXT: movapd 1392(%rdi), %xmm0
5487 ; SSE-NEXT: movapd %xmm0, %xmm1
5488 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm14[0],xmm1[1]
5489 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5490 ; SSE-NEXT: movapd 1408(%rdi), %xmm1
5491 ; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm1[0]
5492 ; SSE-NEXT: movapd 1360(%rdi), %xmm2
5493 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5494 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5495 ; SSE-NEXT: movapd 1424(%rdi), %xmm1
5496 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5497 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5498 ; SSE-NEXT: movapd 1376(%rdi), %xmm2
5499 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5500 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5501 ; SSE-NEXT: movapd 1440(%rdi), %xmm1
5502 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5503 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5504 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5505 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5506 ; SSE-NEXT: movapd 1456(%rdi), %xmm9
5507 ; SSE-NEXT: movapd 1504(%rdi), %xmm0
5508 ; SSE-NEXT: movapd %xmm0, %xmm12
5509 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm9[0],xmm12[1]
5510 ; SSE-NEXT: movapd 1520(%rdi), %xmm13
5511 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm13[0]
5512 ; SSE-NEXT: movapd 1472(%rdi), %xmm2
5513 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm2[0],xmm13[1]
5514 ; SSE-NEXT: movapd 1536(%rdi), %xmm1
5515 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5516 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5517 ; SSE-NEXT: movapd 1488(%rdi), %xmm2
5518 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5519 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5520 ; SSE-NEXT: movapd 1552(%rdi), %xmm1
5521 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5522 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5523 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5524 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5525 ; SSE-NEXT: movapd 1568(%rdi), %xmm6
5526 ; SSE-NEXT: movapd 1616(%rdi), %xmm0
5527 ; SSE-NEXT: movapd %xmm0, %xmm4
5528 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1]
5529 ; SSE-NEXT: movapd 1632(%rdi), %xmm8
5530 ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm8[0]
5531 ; SSE-NEXT: movapd 1584(%rdi), %xmm11
5532 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm11[0],xmm8[1]
5533 ; SSE-NEXT: movapd 1648(%rdi), %xmm1
5534 ; SSE-NEXT: shufpd {{.*#+}} xmm11 = xmm11[1],xmm1[0]
5535 ; SSE-NEXT: movapd 1600(%rdi), %xmm2
5536 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
5537 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5538 ; SSE-NEXT: movapd 1664(%rdi), %xmm1
5539 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
5540 ; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill
5541 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
5542 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5543 ; SSE-NEXT: movapd 1680(%rdi), %xmm3
5544 ; SSE-NEXT: movapd 1728(%rdi), %xmm2
5545 ; SSE-NEXT: movapd %xmm2, %xmm1
5546 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
5547 ; SSE-NEXT: movapd 1744(%rdi), %xmm5
5548 ; SSE-NEXT: shufpd {{.*#+}} xmm3 = xmm3[1],xmm5[0]
5549 ; SSE-NEXT: movapd 1696(%rdi), %xmm7
5550 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1]
5551 ; SSE-NEXT: movapd 1760(%rdi), %xmm10
5552 ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm10[0]
5553 ; SSE-NEXT: movapd 1712(%rdi), %xmm15
5554 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm15[0],xmm10[1]
5555 ; SSE-NEXT: movapd 1776(%rdi), %xmm0
5556 ; SSE-NEXT: shufpd {{.*#+}} xmm15 = xmm15[1],xmm0[0]
5557 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
5558 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5559 ; SSE-NEXT: movapd %xmm4, 224(%rsi)
5560 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5561 ; SSE-NEXT: movaps %xmm0, 160(%rsi)
5562 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5563 ; SSE-NEXT: movaps %xmm2, 96(%rsi)
5564 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5565 ; SSE-NEXT: movaps %xmm2, 32(%rsi)
5566 ; SSE-NEXT: movapd %xmm1, 240(%rsi)
5567 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5568 ; SSE-NEXT: movaps %xmm0, 176(%rsi)
5569 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5570 ; SSE-NEXT: movaps %xmm1, 112(%rsi)
5571 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5572 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
5573 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5574 ; SSE-NEXT: movaps %xmm0, 192(%rsi)
5575 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5576 ; SSE-NEXT: movaps %xmm1, 128(%rsi)
5577 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5578 ; SSE-NEXT: movaps %xmm1, 64(%rsi)
5579 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5580 ; SSE-NEXT: movaps %xmm1, (%rsi)
5581 ; SSE-NEXT: movapd %xmm12, 208(%rsi)
5582 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5583 ; SSE-NEXT: movaps %xmm1, 144(%rsi)
5584 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5585 ; SSE-NEXT: movaps %xmm1, 80(%rsi)
5586 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5587 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
5588 ; SSE-NEXT: movapd %xmm6, 224(%rdx)
5589 ; SSE-NEXT: movapd %xmm3, 240(%rdx)
5590 ; SSE-NEXT: movapd %xmm14, 192(%rdx)
5591 ; SSE-NEXT: movapd %xmm9, 208(%rdx)
5592 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5593 ; SSE-NEXT: movaps %xmm0, 160(%rdx)
5594 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5595 ; SSE-NEXT: movaps %xmm0, 176(%rdx)
5596 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5597 ; SSE-NEXT: movaps %xmm0, 128(%rdx)
5598 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5599 ; SSE-NEXT: movaps %xmm0, 144(%rdx)
5600 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5601 ; SSE-NEXT: movaps %xmm0, 96(%rdx)
5602 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5603 ; SSE-NEXT: movaps %xmm0, 112(%rdx)
5604 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5605 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
5606 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5607 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
5608 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5609 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
5610 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5611 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
5612 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5613 ; SSE-NEXT: movaps %xmm0, (%rdx)
5614 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5615 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
5616 ; SSE-NEXT: movapd %xmm5, 240(%rcx)
5617 ; SSE-NEXT: movapd %xmm8, 224(%rcx)
5618 ; SSE-NEXT: movapd %xmm13, 208(%rcx)
5619 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5620 ; SSE-NEXT: movaps %xmm0, 192(%rcx)
5621 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5622 ; SSE-NEXT: movaps %xmm0, 176(%rcx)
5623 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5624 ; SSE-NEXT: movaps %xmm0, 160(%rcx)
5625 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5626 ; SSE-NEXT: movaps %xmm0, 144(%rcx)
5627 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5628 ; SSE-NEXT: movaps %xmm0, 128(%rcx)
5629 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5630 ; SSE-NEXT: movaps %xmm0, 112(%rcx)
5631 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5632 ; SSE-NEXT: movaps %xmm0, 96(%rcx)
5633 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5634 ; SSE-NEXT: movaps %xmm0, 80(%rcx)
5635 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5636 ; SSE-NEXT: movaps %xmm0, 64(%rcx)
5637 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5638 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
5639 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5640 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
5641 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5642 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
5643 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5644 ; SSE-NEXT: movaps %xmm0, (%rcx)
5645 ; SSE-NEXT: movapd %xmm7, 240(%r8)
5646 ; SSE-NEXT: movapd %xmm11, 224(%r8)
5647 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5648 ; SSE-NEXT: movaps %xmm0, 208(%r8)
5649 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5650 ; SSE-NEXT: movaps %xmm0, 192(%r8)
5651 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5652 ; SSE-NEXT: movaps %xmm0, 176(%r8)
5653 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5654 ; SSE-NEXT: movaps %xmm0, 160(%r8)
5655 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5656 ; SSE-NEXT: movaps %xmm0, 144(%r8)
5657 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5658 ; SSE-NEXT: movaps %xmm0, 128(%r8)
5659 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5660 ; SSE-NEXT: movaps %xmm0, 112(%r8)
5661 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5662 ; SSE-NEXT: movaps %xmm0, 96(%r8)
5663 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5664 ; SSE-NEXT: movaps %xmm0, 80(%r8)
5665 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5666 ; SSE-NEXT: movaps %xmm0, 64(%r8)
5667 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5668 ; SSE-NEXT: movaps %xmm0, 48(%r8)
5669 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5670 ; SSE-NEXT: movaps %xmm0, 32(%r8)
5671 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5672 ; SSE-NEXT: movaps %xmm0, 16(%r8)
5673 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5674 ; SSE-NEXT: movaps %xmm0, (%r8)
5675 ; SSE-NEXT: movapd %xmm10, 240(%r9)
5676 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5677 ; SSE-NEXT: movaps %xmm0, 224(%r9)
5678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5679 ; SSE-NEXT: movaps %xmm0, 208(%r9)
5680 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5681 ; SSE-NEXT: movaps %xmm0, 192(%r9)
5682 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5683 ; SSE-NEXT: movaps %xmm0, 176(%r9)
5684 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5685 ; SSE-NEXT: movaps %xmm0, 160(%r9)
5686 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5687 ; SSE-NEXT: movaps %xmm0, 144(%r9)
5688 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5689 ; SSE-NEXT: movaps %xmm0, 128(%r9)
5690 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5691 ; SSE-NEXT: movaps %xmm0, 112(%r9)
5692 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5693 ; SSE-NEXT: movaps %xmm0, 96(%r9)
5694 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5695 ; SSE-NEXT: movaps %xmm0, 80(%r9)
5696 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5697 ; SSE-NEXT: movaps %xmm0, 64(%r9)
5698 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5699 ; SSE-NEXT: movaps %xmm0, 48(%r9)
5700 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5701 ; SSE-NEXT: movaps %xmm0, 32(%r9)
5702 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5703 ; SSE-NEXT: movaps %xmm0, 16(%r9)
5704 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5705 ; SSE-NEXT: movaps %xmm0, (%r9)
5706 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
5707 ; SSE-NEXT: movapd %xmm15, 240(%rax)
5708 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
5709 ; SSE-NEXT: movaps %xmm0, 224(%rax)
5710 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5711 ; SSE-NEXT: movaps %xmm0, 208(%rax)
5712 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5713 ; SSE-NEXT: movaps %xmm0, 192(%rax)
5714 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5715 ; SSE-NEXT: movaps %xmm0, 176(%rax)
5716 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5717 ; SSE-NEXT: movaps %xmm0, 160(%rax)
5718 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5719 ; SSE-NEXT: movaps %xmm0, 144(%rax)
5720 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5721 ; SSE-NEXT: movaps %xmm0, 128(%rax)
5722 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5723 ; SSE-NEXT: movaps %xmm0, 112(%rax)
5724 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5725 ; SSE-NEXT: movaps %xmm0, 96(%rax)
5726 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5727 ; SSE-NEXT: movaps %xmm0, 80(%rax)
5728 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5729 ; SSE-NEXT: movaps %xmm0, 64(%rax)
5730 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5731 ; SSE-NEXT: movaps %xmm0, 48(%rax)
5732 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5733 ; SSE-NEXT: movaps %xmm0, 32(%rax)
5734 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5735 ; SSE-NEXT: movaps %xmm0, 16(%rax)
5736 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5737 ; SSE-NEXT: movaps %xmm0, (%rax)
5738 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
5739 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5740 ; SSE-NEXT: movaps %xmm0, 240(%rax)
5741 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5742 ; SSE-NEXT: movaps %xmm0, 224(%rax)
5743 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5744 ; SSE-NEXT: movaps %xmm0, 208(%rax)
5745 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5746 ; SSE-NEXT: movaps %xmm0, 192(%rax)
5747 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5748 ; SSE-NEXT: movaps %xmm0, 176(%rax)
5749 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5750 ; SSE-NEXT: movaps %xmm0, 160(%rax)
5751 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5752 ; SSE-NEXT: movaps %xmm0, 144(%rax)
5753 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5754 ; SSE-NEXT: movaps %xmm0, 128(%rax)
5755 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5756 ; SSE-NEXT: movaps %xmm0, 112(%rax)
5757 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5758 ; SSE-NEXT: movaps %xmm0, 96(%rax)
5759 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5760 ; SSE-NEXT: movaps %xmm0, 80(%rax)
5761 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5762 ; SSE-NEXT: movaps %xmm0, 64(%rax)
5763 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5764 ; SSE-NEXT: movaps %xmm0, 48(%rax)
5765 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5766 ; SSE-NEXT: movaps %xmm0, 32(%rax)
5767 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5768 ; SSE-NEXT: movaps %xmm0, 16(%rax)
5769 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5770 ; SSE-NEXT: movaps %xmm0, (%rax)
5771 ; SSE-NEXT: addq $1448, %rsp # imm = 0x5A8
5772 ; SSE-NEXT: retq
5773 ;
5774 ; AVX-LABEL: load_i64_stride7_vf32:
5775 ; AVX: # %bb.0:
5776 ; AVX-NEXT: subq $1736, %rsp # imm = 0x6C8
5777 ; AVX-NEXT: vmovaps 1216(%rdi), %ymm1
5778 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5779 ; AVX-NEXT: vmovaps 768(%rdi), %ymm3
5780 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5781 ; AVX-NEXT: vmovaps 320(%rdi), %ymm7
5782 ; AVX-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0
5783 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5784 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
5785 ; AVX-NEXT: vmovaps 224(%rdi), %xmm5
5786 ; AVX-NEXT: vmovaps 272(%rdi), %xmm2
5787 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5788 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
5789 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5790 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5791 ; AVX-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0
5792 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5793 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
5794 ; AVX-NEXT: vmovaps 672(%rdi), %xmm6
5795 ; AVX-NEXT: vmovaps 720(%rdi), %xmm2
5796 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5797 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm6[0,1],xmm2[2,3]
5798 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5799 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5800 ; AVX-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0
5801 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5802 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5803 ; AVX-NEXT: vmovaps 1120(%rdi), %xmm11
5804 ; AVX-NEXT: vmovaps 1168(%rdi), %xmm2
5805 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5806 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm11[0,1],xmm2[2,3]
5807 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5808 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5809 ; AVX-NEXT: vmovaps 1664(%rdi), %ymm14
5810 ; AVX-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
5811 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5812 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
5813 ; AVX-NEXT: vmovaps 1568(%rdi), %xmm8
5814 ; AVX-NEXT: vmovaps 1616(%rdi), %xmm2
5815 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5816 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm8[0,1],xmm2[2,3]
5817 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
5818 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5819 ; AVX-NEXT: vmovapd 96(%rdi), %ymm4
5820 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm0
5821 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5822 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3]
5823 ; AVX-NEXT: vmovapd 48(%rdi), %xmm1
5824 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5825 ; AVX-NEXT: vblendpd {{.*#+}} xmm2 = mem[0],xmm1[1]
5826 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
5827 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5828 ; AVX-NEXT: vmovapd 544(%rdi), %ymm9
5829 ; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm0
5830 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5831 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm9[0,1,2],ymm0[3]
5832 ; AVX-NEXT: vmovapd 448(%rdi), %xmm10
5833 ; AVX-NEXT: vmovapd 496(%rdi), %xmm1
5834 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5835 ; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm10[0],xmm1[1]
5836 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
5837 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5838 ; AVX-NEXT: vmovapd 992(%rdi), %ymm15
5839 ; AVX-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm0
5840 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5841 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm15[0,1,2],ymm0[3]
5842 ; AVX-NEXT: vmovapd 896(%rdi), %xmm12
5843 ; AVX-NEXT: vmovapd 944(%rdi), %xmm1
5844 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5845 ; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm12[0],xmm1[1]
5846 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
5847 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5848 ; AVX-NEXT: vmovapd 1440(%rdi), %ymm2
5849 ; AVX-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm0
5850 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5851 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1,2],ymm0[3]
5852 ; AVX-NEXT: vmovapd 1344(%rdi), %xmm3
5853 ; AVX-NEXT: vmovapd 1392(%rdi), %xmm0
5854 ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5855 ; AVX-NEXT: vblendpd {{.*#+}} xmm13 = xmm3[0],xmm0[1]
5856 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm13[0,1],ymm1[2,3]
5857 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5858 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm0
5859 ; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
5860 ; AVX-NEXT: vmovapd 384(%rdi), %ymm0
5861 ; AVX-NEXT: vshufpd {{.*#+}} ymm7 = ymm7[0],ymm0[1],ymm7[3],ymm0[2]
5862 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm7[2,3]
5863 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5864 ; AVX-NEXT: vmovdqa 736(%rdi), %xmm5
5865 ; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm6[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
5866 ; AVX-NEXT: vmovapd 832(%rdi), %ymm7
5867 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5868 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm7[1],ymm1[3],ymm7[2]
5869 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1],ymm1[2,3]
5870 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5871 ; AVX-NEXT: vmovdqa 1184(%rdi), %xmm1
5872 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
5873 ; AVX-NEXT: vmovapd 1280(%rdi), %ymm6
5874 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
5875 ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[3],ymm6[2]
5876 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3]
5877 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5878 ; AVX-NEXT: vmovdqa 1632(%rdi), %xmm1
5879 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm8[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
5880 ; AVX-NEXT: vmovapd 1728(%rdi), %ymm11
5881 ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm14[0],ymm11[1],ymm14[3],ymm11[2]
5882 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm5[2,3]
5883 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5884 ; AVX-NEXT: vmovapd 160(%rdi), %ymm8
5885 ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[0],ymm8[1],ymm4[3],ymm8[2]
5886 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm1
5887 ; AVX-NEXT: vpalignr {{.*#+}} xmm5 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
5888 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
5889 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5890 ; AVX-NEXT: vmovapd 608(%rdi), %ymm14
5891 ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm9[0],ymm14[1],ymm9[3],ymm14[2]
5892 ; AVX-NEXT: vmovdqa 512(%rdi), %xmm4
5893 ; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
5894 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3]
5895 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5896 ; AVX-NEXT: vmovapd 1056(%rdi), %ymm13
5897 ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm15[0],ymm13[1],ymm15[3],ymm13[2]
5898 ; AVX-NEXT: vmovdqa 960(%rdi), %xmm15
5899 ; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm12[8,9,10,11,12,13,14,15],xmm15[0,1,2,3,4,5,6,7]
5900 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3]
5901 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5902 ; AVX-NEXT: vmovapd 1504(%rdi), %ymm5
5903 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
5904 ; AVX-NEXT: vmovdqa 1408(%rdi), %xmm9
5905 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
5906 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
5907 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5908 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm3
5909 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm2
5910 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
5911 ; AVX-NEXT: vmovapd 240(%rdi), %xmm2
5912 ; AVX-NEXT: vblendpd {{.*#+}} xmm10 = xmm2[0],mem[1]
5913 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,3]
5914 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5915 ; AVX-NEXT: vmovdqa 800(%rdi), %xmm0
5916 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm10
5917 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3]
5918 ; AVX-NEXT: vmovapd 688(%rdi), %xmm10
5919 ; AVX-NEXT: vblendpd {{.*#+}} xmm12 = xmm10[0],mem[1]
5920 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3]
5921 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5922 ; AVX-NEXT: vmovaps 1248(%rdi), %xmm7
5923 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5924 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm12
5925 ; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm12[0,1,2],ymm6[3]
5926 ; AVX-NEXT: vmovapd 1136(%rdi), %xmm6
5927 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm6[0],mem[1]
5928 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1],ymm12[2,3]
5929 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5930 ; AVX-NEXT: vmovaps 1696(%rdi), %xmm7
5931 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5932 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
5933 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm11[3]
5934 ; AVX-NEXT: vmovapd 1584(%rdi), %xmm11
5935 ; AVX-NEXT: vblendpd {{.*#+}} xmm12 = xmm11[0],mem[1]
5936 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm12[0,1],ymm7[2,3]
5937 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5938 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm12
5939 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm7
5940 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm8[3]
5941 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm8
5942 ; AVX-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5943 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7]
5944 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm7[2,3]
5945 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5946 ; AVX-NEXT: vmovaps 576(%rdi), %xmm1
5947 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5948 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
5949 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm14[3]
5950 ; AVX-NEXT: vmovdqa 464(%rdi), %xmm8
5951 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm8[0,1,2,3],xmm4[4,5,6,7]
5952 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3]
5953 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5954 ; AVX-NEXT: vmovdqa 1024(%rdi), %xmm14
5955 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm1
5956 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm13[3]
5957 ; AVX-NEXT: vmovdqa 912(%rdi), %xmm1
5958 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm1[0,1,2,3],xmm15[4,5,6,7]
5959 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1],ymm4[2,3]
5960 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5961 ; AVX-NEXT: vmovdqa 1472(%rdi), %xmm4
5962 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm7
5963 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3]
5964 ; AVX-NEXT: vmovdqa 1360(%rdi), %xmm15
5965 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm15[0,1,2,3],xmm9[4,5,6,7]
5966 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm7[0,1],ymm5[2,3]
5967 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5968 ; AVX-NEXT: vmovapd 304(%rdi), %xmm9
5969 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm9[0],ymm2[2],ymm9[3]
5970 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm13
5971 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
5972 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
5973 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
5974 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5975 ; AVX-NEXT: vmovapd 752(%rdi), %xmm5
5976 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm10[1],ymm5[0],ymm10[2],ymm5[3]
5977 ; AVX-NEXT: vmovdqa 864(%rdi), %xmm10
5978 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
5979 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5980 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
5981 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5982 ; AVX-NEXT: vmovapd 1200(%rdi), %xmm0
5983 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5984 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm6[1],ymm0[0],ymm6[2],ymm0[3]
5985 ; AVX-NEXT: vmovdqa 1312(%rdi), %xmm6
5986 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm2 # 16-byte Folded Reload
5987 ; AVX-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
5988 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
5989 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
5990 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5991 ; AVX-NEXT: vmovapd 1648(%rdi), %xmm0
5992 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5993 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[1],ymm0[0],ymm11[2],ymm0[3]
5994 ; AVX-NEXT: vmovdqa 1760(%rdi), %xmm2
5995 ; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
5996 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
5997 ; AVX-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
5998 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
5999 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
6000 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6001 ; AVX-NEXT: vmovapd 1424(%rdi), %xmm0
6002 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6003 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm15[1],ymm0[0],ymm15[2],ymm0[3]
6004 ; AVX-NEXT: vmovdqa 1536(%rdi), %xmm11
6005 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm4[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
6006 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6007 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
6008 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6009 ; AVX-NEXT: vmovapd 976(%rdi), %xmm15
6010 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[1],ymm15[0],ymm1[2],ymm15[3]
6011 ; AVX-NEXT: vmovdqa 1088(%rdi), %xmm1
6012 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
6013 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6014 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
6015 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6016 ; AVX-NEXT: vmovapd 528(%rdi), %xmm0
6017 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[1],ymm0[0],ymm8[2],ymm0[3]
6018 ; AVX-NEXT: vmovdqa 640(%rdi), %xmm4
6019 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm3 # 16-byte Folded Reload
6020 ; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
6021 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
6022 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
6023 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6024 ; AVX-NEXT: vmovapd 80(%rdi), %xmm3
6025 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6026 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[1],ymm3[0],ymm2[2],ymm3[3]
6027 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm7
6028 ; AVX-NEXT: vpalignr {{.*#+}} xmm8 = xmm12[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
6029 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
6030 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm8[2,3]
6031 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6032 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
6033 ; AVX-NEXT: vmovapd 128(%rdi), %ymm2
6034 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1,2],ymm7[3]
6035 ; AVX-NEXT: vblendpd {{.*#+}} xmm3 = mem[0],xmm3[1]
6036 ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm7[2,3]
6037 ; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6038 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm3
6039 ; AVX-NEXT: vmovapd 352(%rdi), %ymm7
6040 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6041 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm7[0,1,2],ymm3[3]
6042 ; AVX-NEXT: vmovapd 256(%rdi), %xmm3
6043 ; AVX-NEXT: vblendpd {{.*#+}} xmm8 = xmm3[0],xmm9[1]
6044 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3]
6045 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6046 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
6047 ; AVX-NEXT: vmovapd 576(%rdi), %ymm7
6048 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6049 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3]
6050 ; AVX-NEXT: vmovapd 480(%rdi), %xmm7
6051 ; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
6052 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
6053 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6054 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm4
6055 ; AVX-NEXT: vmovapd 800(%rdi), %ymm0
6056 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm0[0,1,2],ymm4[3]
6057 ; AVX-NEXT: vmovapd 704(%rdi), %xmm14
6058 ; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm14[0],xmm5[1]
6059 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
6060 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6061 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6062 ; AVX-NEXT: vmovapd 1024(%rdi), %ymm4
6063 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6064 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1,2],ymm1[3]
6065 ; AVX-NEXT: vmovapd 928(%rdi), %xmm9
6066 ; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm9[0],xmm15[1]
6067 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm4[0,1],ymm1[2,3]
6068 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6069 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm4
6070 ; AVX-NEXT: vmovapd 1248(%rdi), %ymm1
6071 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm1[0,1,2],ymm4[3]
6072 ; AVX-NEXT: vmovapd 1152(%rdi), %xmm15
6073 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm5 # 16-byte Folded Reload
6074 ; AVX-NEXT: # xmm5 = xmm15[0],mem[1]
6075 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
6076 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6077 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm4
6078 ; AVX-NEXT: vmovaps 1472(%rdi), %ymm5
6079 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6080 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
6081 ; AVX-NEXT: vmovaps 1376(%rdi), %xmm8
6082 ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
6083 ; AVX-NEXT: # xmm5 = xmm8[0,1],mem[2,3]
6084 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
6085 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6086 ; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm4 # 16-byte Folded Reload
6087 ; AVX-NEXT: vmovapd 1696(%rdi), %ymm12
6088 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3]
6089 ; AVX-NEXT: vmovapd 1600(%rdi), %xmm13
6090 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload
6091 ; AVX-NEXT: # xmm5 = xmm13[0],mem[1]
6092 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3]
6093 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6094 ; AVX-NEXT: vmovapd 192(%rdi), %ymm11
6095 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm11[1],ymm2[3],ymm11[2]
6096 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm4
6097 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
6098 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm4[0,1],ymm2[2,3]
6099 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6100 ; AVX-NEXT: vmovapd 416(%rdi), %ymm10
6101 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6102 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm10[1],ymm2[3],ymm10[2]
6103 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm6
6104 ; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm3[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
6105 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3]
6106 ; AVX-NEXT: vmovupd %ymm2, (%rsp) # 32-byte Spill
6107 ; AVX-NEXT: vmovdqa 544(%rdi), %xmm2
6108 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm7[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
6109 ; AVX-NEXT: vmovapd 640(%rdi), %ymm5
6110 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6111 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[3],ymm5[2]
6112 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3]
6113 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6114 ; AVX-NEXT: vmovapd 864(%rdi), %ymm7
6115 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm7[1],ymm0[3],ymm7[2]
6116 ; AVX-NEXT: vmovdqa 768(%rdi), %xmm4
6117 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm14[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
6118 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,3]
6119 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6120 ; AVX-NEXT: vmovdqa 992(%rdi), %xmm0
6121 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm9[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6122 ; AVX-NEXT: vmovapd 1088(%rdi), %ymm3
6123 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6124 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[3],ymm3[2]
6125 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
6126 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6127 ; AVX-NEXT: vmovapd 1312(%rdi), %ymm9
6128 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm9[1],ymm1[3],ymm9[2]
6129 ; AVX-NEXT: vmovdqa 1216(%rdi), %xmm2
6130 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm15[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
6131 ; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3]
6132 ; AVX-NEXT: vmovdqa 1440(%rdi), %xmm0
6133 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6134 ; AVX-NEXT: vmovapd 1536(%rdi), %ymm8
6135 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6136 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm8[1],ymm1[3],ymm8[2]
6137 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
6138 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6139 ; AVX-NEXT: vmovapd 1760(%rdi), %ymm1
6140 ; AVX-NEXT: vshufpd {{.*#+}} ymm12 = ymm12[0],ymm1[1],ymm12[3],ymm1[2]
6141 ; AVX-NEXT: vmovdqa 1664(%rdi), %xmm0
6142 ; AVX-NEXT: vpalignr {{.*#+}} xmm13 = xmm13[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6143 ; AVX-NEXT: vblendpd {{.*#+}} ymm12 = ymm13[0,1],ymm12[2,3]
6144 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
6145 ; AVX-NEXT: # ymm11 = mem[0,1,2],ymm11[3]
6146 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6147 ; AVX-NEXT: vblendpd {{.*#+}} xmm13 = xmm13[0],mem[1]
6148 ; AVX-NEXT: vblendpd {{.*#+}} ymm14 = ymm13[0,1],ymm11[2,3]
6149 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
6150 ; AVX-NEXT: # ymm10 = mem[0,1,2],ymm10[3]
6151 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
6152 ; AVX-NEXT: # xmm6 = mem[0,1,2,3],xmm6[4,5,6,7]
6153 ; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm6[0,1],ymm10[2,3]
6154 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm5 # 32-byte Folded Reload
6155 ; AVX-NEXT: # ymm5 = mem[0,1,2],ymm5[3]
6156 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6157 ; AVX-NEXT: vblendpd {{.*#+}} xmm6 = xmm6[0],mem[1]
6158 ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm5[2,3]
6159 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm5 # 32-byte Folded Reload
6160 ; AVX-NEXT: # ymm5 = mem[0,1,2],ymm7[3]
6161 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
6162 ; AVX-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7]
6163 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm4[0,1],ymm5[2,3]
6164 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload
6165 ; AVX-NEXT: # ymm3 = mem[0,1,2],ymm3[3]
6166 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6167 ; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
6168 ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm4[0,1],ymm3[2,3]
6169 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload
6170 ; AVX-NEXT: # ymm4 = mem[0,1,2],ymm9[3]
6171 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6172 ; AVX-NEXT: # xmm2 = mem[0,1,2,3],xmm2[4,5,6,7]
6173 ; AVX-NEXT: vblendpd {{.*#+}} ymm7 = ymm2[0,1],ymm4[2,3]
6174 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm2 # 32-byte Folded Reload
6175 ; AVX-NEXT: # ymm2 = mem[0,1,2],ymm8[3]
6176 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6177 ; AVX-NEXT: vblendpd {{.*#+}} xmm4 = xmm4[0],mem[1]
6178 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm2[2,3]
6179 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6180 ; AVX-NEXT: # ymm1 = mem[0,1,2],ymm1[3]
6181 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6182 ; AVX-NEXT: # xmm0 = mem[0,1,2,3],xmm0[4,5,6,7]
6183 ; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm0[0,1],ymm1[2,3]
6184 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6185 ; AVX-NEXT: vmovaps %ymm0, 192(%rsi)
6186 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6187 ; AVX-NEXT: vmovaps %ymm0, 128(%rsi)
6188 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6189 ; AVX-NEXT: vmovaps %ymm0, 64(%rsi)
6190 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6191 ; AVX-NEXT: vmovaps %ymm0, (%rsi)
6192 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6193 ; AVX-NEXT: vmovaps %ymm0, 224(%rsi)
6194 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6195 ; AVX-NEXT: vmovaps %ymm0, 160(%rsi)
6196 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6197 ; AVX-NEXT: vmovaps %ymm0, 96(%rsi)
6198 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6199 ; AVX-NEXT: vmovaps %ymm0, 32(%rsi)
6200 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6201 ; AVX-NEXT: vmovaps %ymm0, 192(%rdx)
6202 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6203 ; AVX-NEXT: vmovaps %ymm0, 128(%rdx)
6204 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6205 ; AVX-NEXT: vmovaps %ymm0, 64(%rdx)
6206 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6207 ; AVX-NEXT: vmovaps %ymm0, (%rdx)
6208 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6209 ; AVX-NEXT: vmovaps %ymm0, 224(%rdx)
6210 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6211 ; AVX-NEXT: vmovaps %ymm0, 160(%rdx)
6212 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6213 ; AVX-NEXT: vmovaps %ymm0, 96(%rdx)
6214 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6215 ; AVX-NEXT: vmovaps %ymm0, 32(%rdx)
6216 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6217 ; AVX-NEXT: vmovaps %ymm0, 192(%rcx)
6218 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6219 ; AVX-NEXT: vmovaps %ymm0, 128(%rcx)
6220 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6221 ; AVX-NEXT: vmovaps %ymm0, 64(%rcx)
6222 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6223 ; AVX-NEXT: vmovaps %ymm0, (%rcx)
6224 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6225 ; AVX-NEXT: vmovaps %ymm0, 224(%rcx)
6226 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6227 ; AVX-NEXT: vmovaps %ymm0, 160(%rcx)
6228 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6229 ; AVX-NEXT: vmovaps %ymm0, 96(%rcx)
6230 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6231 ; AVX-NEXT: vmovaps %ymm0, 32(%rcx)
6232 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6233 ; AVX-NEXT: vmovaps %ymm0, (%r8)
6234 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6235 ; AVX-NEXT: vmovaps %ymm0, 64(%r8)
6236 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6237 ; AVX-NEXT: vmovaps %ymm0, 128(%r8)
6238 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6239 ; AVX-NEXT: vmovaps %ymm0, 192(%r8)
6240 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6241 ; AVX-NEXT: vmovaps %ymm0, 224(%r8)
6242 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6243 ; AVX-NEXT: vmovaps %ymm0, 160(%r8)
6244 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6245 ; AVX-NEXT: vmovaps %ymm0, 96(%r8)
6246 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6247 ; AVX-NEXT: vmovaps %ymm0, 32(%r8)
6248 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6249 ; AVX-NEXT: vmovaps %ymm0, 224(%r9)
6250 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6251 ; AVX-NEXT: vmovaps %ymm0, 192(%r9)
6252 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6253 ; AVX-NEXT: vmovaps %ymm0, 160(%r9)
6254 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6255 ; AVX-NEXT: vmovaps %ymm0, 128(%r9)
6256 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6257 ; AVX-NEXT: vmovaps %ymm0, 96(%r9)
6258 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6259 ; AVX-NEXT: vmovaps %ymm0, 64(%r9)
6260 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6261 ; AVX-NEXT: vmovaps %ymm0, 32(%r9)
6262 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6263 ; AVX-NEXT: vmovaps %ymm0, (%r9)
6264 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
6265 ; AVX-NEXT: vmovapd %ymm12, 224(%rax)
6266 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6267 ; AVX-NEXT: vmovaps %ymm0, 192(%rax)
6268 ; AVX-NEXT: vmovapd %ymm15, 160(%rax)
6269 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6270 ; AVX-NEXT: vmovaps %ymm0, 128(%rax)
6271 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6272 ; AVX-NEXT: vmovaps %ymm0, 96(%rax)
6273 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6274 ; AVX-NEXT: vmovaps %ymm0, 64(%rax)
6275 ; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
6276 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
6277 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6278 ; AVX-NEXT: vmovaps %ymm0, (%rax)
6279 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
6280 ; AVX-NEXT: vmovapd %ymm10, 224(%rax)
6281 ; AVX-NEXT: vmovapd %ymm4, 192(%rax)
6282 ; AVX-NEXT: vmovapd %ymm7, 160(%rax)
6283 ; AVX-NEXT: vmovapd %ymm3, 128(%rax)
6284 ; AVX-NEXT: vmovapd %ymm5, 96(%rax)
6285 ; AVX-NEXT: vmovapd %ymm6, 64(%rax)
6286 ; AVX-NEXT: vmovapd %ymm11, 32(%rax)
6287 ; AVX-NEXT: vmovapd %ymm14, (%rax)
6288 ; AVX-NEXT: addq $1736, %rsp # imm = 0x6C8
6289 ; AVX-NEXT: vzeroupper
6290 ; AVX-NEXT: retq
6291 ;
6292 ; AVX2-LABEL: load_i64_stride7_vf32:
6293 ; AVX2: # %bb.0:
6294 ; AVX2-NEXT: subq $1576, %rsp # imm = 0x628
6295 ; AVX2-NEXT: vmovdqa 1216(%rdi), %ymm1
6296 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6297 ; AVX2-NEXT: vmovdqa 768(%rdi), %ymm6
6298 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6299 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm2
6300 ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm0
6301 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6302 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
6303 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
6304 ; AVX2-NEXT: vmovdqa 224(%rdi), %xmm4
6305 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm0
6306 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6307 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm0[2,3]
6308 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
6309 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6310 ; AVX2-NEXT: vmovdqa 832(%rdi), %xmm0
6311 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6312 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
6313 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
6314 ; AVX2-NEXT: vmovdqa 672(%rdi), %xmm5
6315 ; AVX2-NEXT: vmovdqa 720(%rdi), %xmm0
6316 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6317 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3]
6318 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7]
6319 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6320 ; AVX2-NEXT: vmovdqa 1280(%rdi), %xmm0
6321 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6322 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
6323 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7]
6324 ; AVX2-NEXT: vmovdqa 1120(%rdi), %xmm6
6325 ; AVX2-NEXT: vmovdqa 1168(%rdi), %xmm0
6326 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6327 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm0[2,3]
6328 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7]
6329 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6330 ; AVX2-NEXT: vmovdqa 1664(%rdi), %ymm3
6331 ; AVX2-NEXT: vmovdqa 1728(%rdi), %xmm0
6332 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6333 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7
6334 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm7[6,7]
6335 ; AVX2-NEXT: vmovdqa 1568(%rdi), %xmm8
6336 ; AVX2-NEXT: vmovdqa 1616(%rdi), %xmm0
6337 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6338 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3]
6339 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7]
6340 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6341 ; AVX2-NEXT: vmovdqa 96(%rdi), %ymm7
6342 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm0
6343 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6344 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9
6345 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7]
6346 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm0
6347 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6348 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = mem[0,1],xmm0[2,3]
6349 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7]
6350 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6351 ; AVX2-NEXT: vmovdqa 544(%rdi), %ymm9
6352 ; AVX2-NEXT: vmovdqa 608(%rdi), %xmm0
6353 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6354 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10
6355 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm10[6,7]
6356 ; AVX2-NEXT: vmovdqa 448(%rdi), %xmm10
6357 ; AVX2-NEXT: vmovdqa 496(%rdi), %xmm0
6358 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6359 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm10[0,1],xmm0[2,3]
6360 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7]
6361 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6362 ; AVX2-NEXT: vmovdqa 992(%rdi), %ymm11
6363 ; AVX2-NEXT: vmovdqa 1056(%rdi), %xmm0
6364 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6365 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12
6366 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm12[6,7]
6367 ; AVX2-NEXT: vmovdqa 896(%rdi), %xmm12
6368 ; AVX2-NEXT: vmovdqa 944(%rdi), %xmm0
6369 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6370 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0,1],xmm0[2,3]
6371 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7]
6372 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6373 ; AVX2-NEXT: vmovdqa 1440(%rdi), %ymm13
6374 ; AVX2-NEXT: vmovdqa 1504(%rdi), %xmm0
6375 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6376 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14
6377 ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5],ymm14[6,7]
6378 ; AVX2-NEXT: vmovdqa 1344(%rdi), %xmm14
6379 ; AVX2-NEXT: vmovdqa 1392(%rdi), %xmm0
6380 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6381 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
6382 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
6383 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6384 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm0
6385 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6386 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm4
6387 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
6388 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6389 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6390 ; AVX2-NEXT: vmovdqa 736(%rdi), %xmm0
6391 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6392 ; AVX2-NEXT: vmovdqa 832(%rdi), %ymm2
6393 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
6394 ; AVX2-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
6395 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6396 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6397 ; AVX2-NEXT: vmovdqa 1184(%rdi), %xmm0
6398 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6399 ; AVX2-NEXT: vmovdqa 1280(%rdi), %ymm5
6400 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
6401 ; AVX2-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
6402 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6403 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6404 ; AVX2-NEXT: vmovdqa 1632(%rdi), %xmm0
6405 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6406 ; AVX2-NEXT: vmovdqa 1728(%rdi), %ymm8
6407 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
6408 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6409 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6410 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0
6411 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
6412 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3
6413 ; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
6414 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
6415 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6416 ; AVX2-NEXT: vmovdqa 608(%rdi), %ymm1
6417 ; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6418 ; AVX2-NEXT: vmovdqa 512(%rdi), %xmm7
6419 ; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
6420 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
6421 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6422 ; AVX2-NEXT: vmovdqa 1056(%rdi), %ymm6
6423 ; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
6424 ; AVX2-NEXT: vmovdqa 960(%rdi), %xmm10
6425 ; AVX2-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
6426 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
6427 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6428 ; AVX2-NEXT: vmovdqa 1504(%rdi), %ymm9
6429 ; AVX2-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
6430 ; AVX2-NEXT: vmovdqa 1408(%rdi), %xmm11
6431 ; AVX2-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
6432 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
6433 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6434 ; AVX2-NEXT: vpbroadcastq 352(%rdi), %ymm12
6435 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm12[1],ymm4[1],ymm12[3],ymm4[3]
6436 ; AVX2-NEXT: vmovdqa 240(%rdi), %xmm12
6437 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3]
6438 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
6439 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6440 ; AVX2-NEXT: vpbroadcastq 800(%rdi), %ymm4
6441 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
6442 ; AVX2-NEXT: vmovdqa 688(%rdi), %xmm4
6443 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
6444 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
6445 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6446 ; AVX2-NEXT: vpbroadcastq 1248(%rdi), %ymm2
6447 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3]
6448 ; AVX2-NEXT: vmovdqa 1136(%rdi), %xmm4
6449 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
6450 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
6451 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6452 ; AVX2-NEXT: vpbroadcastq 1696(%rdi), %ymm2
6453 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
6454 ; AVX2-NEXT: vmovdqa 1584(%rdi), %xmm4
6455 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
6456 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
6457 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6458 ; AVX2-NEXT: vpbroadcastq 128(%rdi), %ymm2
6459 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
6460 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3]
6461 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6462 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6463 ; AVX2-NEXT: vpbroadcastq 576(%rdi), %ymm0
6464 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6465 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3]
6466 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6467 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6468 ; AVX2-NEXT: vpbroadcastq 1024(%rdi), %ymm0
6469 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3]
6470 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3]
6471 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6472 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6473 ; AVX2-NEXT: vpbroadcastq 1472(%rdi), %ymm0
6474 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
6475 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm11[2,3]
6476 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6477 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6478 ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm0
6479 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
6480 ; AVX2-NEXT: vmovdqa 416(%rdi), %xmm0
6481 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6482 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6483 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6484 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6485 ; AVX2-NEXT: vmovdqa 736(%rdi), %ymm1
6486 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6487 ; AVX2-NEXT: vmovdqa 864(%rdi), %xmm3
6488 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
6489 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6490 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6491 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6492 ; AVX2-NEXT: vmovdqa 1184(%rdi), %ymm1
6493 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6494 ; AVX2-NEXT: vmovdqa 1312(%rdi), %xmm7
6495 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
6496 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6497 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6498 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6499 ; AVX2-NEXT: vmovdqa 1632(%rdi), %ymm1
6500 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6501 ; AVX2-NEXT: vmovdqa 1760(%rdi), %xmm6
6502 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
6503 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6504 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6505 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6506 ; AVX2-NEXT: vmovdqa 1408(%rdi), %ymm1
6507 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6508 ; AVX2-NEXT: vmovdqa 1536(%rdi), %xmm5
6509 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
6510 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6511 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6512 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6513 ; AVX2-NEXT: vmovdqa 960(%rdi), %ymm1
6514 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6515 ; AVX2-NEXT: vmovdqa 1088(%rdi), %xmm4
6516 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
6517 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6518 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6519 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6520 ; AVX2-NEXT: vmovdqa 512(%rdi), %ymm1
6521 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6522 ; AVX2-NEXT: vmovdqa 640(%rdi), %xmm8
6523 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
6524 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6525 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6526 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6527 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm1
6528 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6529 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm2
6530 ; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
6531 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
6532 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3]
6533 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6534 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1
6535 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm12
6536 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
6537 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
6538 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
6539 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6540 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6541 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6542 ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm14
6543 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
6544 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm2
6545 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],mem[2,3]
6546 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6547 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6548 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
6549 ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm1
6550 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6551 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6552 ; AVX2-NEXT: vmovdqa 480(%rdi), %xmm11
6553 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],mem[2,3]
6554 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6555 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6556 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
6557 ; AVX2-NEXT: vmovdqa 800(%rdi), %ymm10
6558 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
6559 ; AVX2-NEXT: vmovdqa 704(%rdi), %xmm1
6560 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3]
6561 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6562 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6563 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
6564 ; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm3
6565 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6566 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6567 ; AVX2-NEXT: vmovdqa 928(%rdi), %xmm4
6568 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3]
6569 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6570 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6571 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
6572 ; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm8
6573 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm0[6,7]
6574 ; AVX2-NEXT: vmovdqa 1152(%rdi), %xmm0
6575 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
6576 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
6577 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6578 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
6579 ; AVX2-NEXT: vmovdqa 1472(%rdi), %ymm15
6580 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm3[6,7]
6581 ; AVX2-NEXT: vmovdqa 1376(%rdi), %xmm3
6582 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0,1],mem[2,3]
6583 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
6584 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6585 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5
6586 ; AVX2-NEXT: vmovdqa 1696(%rdi), %ymm7
6587 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
6588 ; AVX2-NEXT: vmovdqa 1600(%rdi), %xmm9
6589 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],mem[2,3]
6590 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
6591 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6592 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm13
6593 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
6594 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm6
6595 ; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
6596 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
6597 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6598 ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm12
6599 ; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
6600 ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm6
6601 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
6602 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
6603 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6604 ; AVX2-NEXT: vmovdqa 544(%rdi), %xmm2
6605 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
6606 ; AVX2-NEXT: vmovdqa 640(%rdi), %ymm5
6607 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
6608 ; AVX2-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
6609 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
6610 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6611 ; AVX2-NEXT: vmovdqa 864(%rdi), %ymm11
6612 ; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
6613 ; AVX2-NEXT: vmovdqa 768(%rdi), %xmm10
6614 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
6615 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6616 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6617 ; AVX2-NEXT: vmovdqa 992(%rdi), %xmm1
6618 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
6619 ; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm2
6620 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
6621 ; AVX2-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
6622 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
6623 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6624 ; AVX2-NEXT: vmovdqa 1312(%rdi), %ymm4
6625 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
6626 ; AVX2-NEXT: vmovdqa 1216(%rdi), %xmm8
6627 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
6628 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6629 ; AVX2-NEXT: vmovdqa 1440(%rdi), %xmm0
6630 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6631 ; AVX2-NEXT: vmovdqa 1536(%rdi), %ymm0
6632 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
6633 ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7]
6634 ; AVX2-NEXT: vmovdqa 1760(%rdi), %ymm3
6635 ; AVX2-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
6636 ; AVX2-NEXT: vmovdqa 1664(%rdi), %xmm1
6637 ; AVX2-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
6638 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
6639 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload
6640 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3]
6641 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6642 ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
6643 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
6644 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload
6645 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
6646 ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
6647 ; AVX2-NEXT: # xmm6 = mem[0,1],xmm6[2,3]
6648 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
6649 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload
6650 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3]
6651 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6652 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3]
6653 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7]
6654 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
6655 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3]
6656 ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
6657 ; AVX2-NEXT: # xmm10 = mem[0,1],xmm10[2,3]
6658 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm5[4,5,6,7]
6659 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
6660 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3]
6661 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6662 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
6663 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
6664 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
6665 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
6666 ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
6667 ; AVX2-NEXT: # xmm5 = mem[0,1],xmm8[2,3]
6668 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
6669 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
6670 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3]
6671 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6672 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
6673 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6674 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
6675 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
6676 ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6677 ; AVX2-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
6678 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm3[4,5,6,7]
6679 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6680 ; AVX2-NEXT: vmovaps %ymm1, 192(%rsi)
6681 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6682 ; AVX2-NEXT: vmovaps %ymm1, 128(%rsi)
6683 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6684 ; AVX2-NEXT: vmovaps %ymm1, 64(%rsi)
6685 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6686 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
6687 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6688 ; AVX2-NEXT: vmovaps %ymm1, 224(%rsi)
6689 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6690 ; AVX2-NEXT: vmovaps %ymm1, 160(%rsi)
6691 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6692 ; AVX2-NEXT: vmovaps %ymm1, 96(%rsi)
6693 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6694 ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
6695 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6696 ; AVX2-NEXT: vmovaps %ymm1, 192(%rdx)
6697 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6698 ; AVX2-NEXT: vmovaps %ymm1, 128(%rdx)
6699 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6700 ; AVX2-NEXT: vmovaps %ymm1, 64(%rdx)
6701 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6702 ; AVX2-NEXT: vmovaps %ymm1, (%rdx)
6703 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6704 ; AVX2-NEXT: vmovaps %ymm1, 224(%rdx)
6705 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6706 ; AVX2-NEXT: vmovaps %ymm1, 160(%rdx)
6707 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6708 ; AVX2-NEXT: vmovaps %ymm1, 96(%rdx)
6709 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6710 ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
6711 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6712 ; AVX2-NEXT: vmovaps %ymm1, 192(%rcx)
6713 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6714 ; AVX2-NEXT: vmovaps %ymm1, 128(%rcx)
6715 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6716 ; AVX2-NEXT: vmovaps %ymm1, 64(%rcx)
6717 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6718 ; AVX2-NEXT: vmovaps %ymm1, (%rcx)
6719 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6720 ; AVX2-NEXT: vmovaps %ymm1, 224(%rcx)
6721 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6722 ; AVX2-NEXT: vmovaps %ymm1, 160(%rcx)
6723 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6724 ; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
6725 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6726 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
6727 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6728 ; AVX2-NEXT: vmovaps %ymm1, (%r8)
6729 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6730 ; AVX2-NEXT: vmovaps %ymm1, 64(%r8)
6731 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6732 ; AVX2-NEXT: vmovaps %ymm1, 128(%r8)
6733 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6734 ; AVX2-NEXT: vmovaps %ymm1, 192(%r8)
6735 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6736 ; AVX2-NEXT: vmovaps %ymm1, 224(%r8)
6737 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6738 ; AVX2-NEXT: vmovaps %ymm1, 160(%r8)
6739 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6740 ; AVX2-NEXT: vmovaps %ymm1, 96(%r8)
6741 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6742 ; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
6743 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6744 ; AVX2-NEXT: vmovaps %ymm1, 224(%r9)
6745 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6746 ; AVX2-NEXT: vmovaps %ymm1, 192(%r9)
6747 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6748 ; AVX2-NEXT: vmovaps %ymm1, 160(%r9)
6749 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6750 ; AVX2-NEXT: vmovaps %ymm1, 128(%r9)
6751 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6752 ; AVX2-NEXT: vmovaps %ymm1, 96(%r9)
6753 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6754 ; AVX2-NEXT: vmovaps %ymm1, 64(%r9)
6755 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6756 ; AVX2-NEXT: vmovaps %ymm1, 32(%r9)
6757 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6758 ; AVX2-NEXT: vmovaps %ymm1, (%r9)
6759 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
6760 ; AVX2-NEXT: vmovdqa %ymm7, 224(%rax)
6761 ; AVX2-NEXT: vmovdqa %ymm15, 192(%rax)
6762 ; AVX2-NEXT: vmovdqa %ymm14, 160(%rax)
6763 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6764 ; AVX2-NEXT: vmovaps %ymm1, 128(%rax)
6765 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6766 ; AVX2-NEXT: vmovaps %ymm1, 96(%rax)
6767 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6768 ; AVX2-NEXT: vmovaps %ymm1, 64(%rax)
6769 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6770 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
6771 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6772 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
6773 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
6774 ; AVX2-NEXT: vmovdqa %ymm10, 224(%rax)
6775 ; AVX2-NEXT: vmovdqa %ymm0, 192(%rax)
6776 ; AVX2-NEXT: vmovdqa %ymm4, 160(%rax)
6777 ; AVX2-NEXT: vmovdqa %ymm2, 128(%rax)
6778 ; AVX2-NEXT: vmovdqa %ymm11, 96(%rax)
6779 ; AVX2-NEXT: vmovdqa %ymm12, 64(%rax)
6780 ; AVX2-NEXT: vmovdqa %ymm6, 32(%rax)
6781 ; AVX2-NEXT: vmovdqa %ymm9, (%rax)
6782 ; AVX2-NEXT: addq $1576, %rsp # imm = 0x628
6783 ; AVX2-NEXT: vzeroupper
6784 ; AVX2-NEXT: retq
6785 ;
6786 ; AVX2-FP-LABEL: load_i64_stride7_vf32:
6787 ; AVX2-FP: # %bb.0:
6788 ; AVX2-FP-NEXT: subq $1576, %rsp # imm = 0x628
6789 ; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %ymm1
6790 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6791 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm6
6792 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6793 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm2
6794 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm0
6795 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6796 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
6797 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
6798 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm4
6799 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm0
6800 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6801 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm0[2,3]
6802 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
6803 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6804 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm0
6805 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6806 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
6807 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
6808 ; AVX2-FP-NEXT: vmovdqa 672(%rdi), %xmm5
6809 ; AVX2-FP-NEXT: vmovdqa 720(%rdi), %xmm0
6810 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6811 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3]
6812 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7]
6813 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6814 ; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %xmm0
6815 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6816 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
6817 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7]
6818 ; AVX2-FP-NEXT: vmovdqa 1120(%rdi), %xmm6
6819 ; AVX2-FP-NEXT: vmovdqa 1168(%rdi), %xmm0
6820 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6821 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm0[2,3]
6822 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7]
6823 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6824 ; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %ymm3
6825 ; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %xmm0
6826 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6827 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7
6828 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm7[6,7]
6829 ; AVX2-FP-NEXT: vmovdqa 1568(%rdi), %xmm8
6830 ; AVX2-FP-NEXT: vmovdqa 1616(%rdi), %xmm0
6831 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6832 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3]
6833 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7]
6834 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6835 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %ymm7
6836 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm0
6837 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6838 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9
6839 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7]
6840 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm0
6841 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6842 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = mem[0,1],xmm0[2,3]
6843 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7]
6844 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6845 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm9
6846 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm0
6847 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6848 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10
6849 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm10[6,7]
6850 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm10
6851 ; AVX2-FP-NEXT: vmovdqa 496(%rdi), %xmm0
6852 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6853 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm10[0,1],xmm0[2,3]
6854 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7]
6855 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6856 ; AVX2-FP-NEXT: vmovdqa 992(%rdi), %ymm11
6857 ; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %xmm0
6858 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6859 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12
6860 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm12[6,7]
6861 ; AVX2-FP-NEXT: vmovdqa 896(%rdi), %xmm12
6862 ; AVX2-FP-NEXT: vmovdqa 944(%rdi), %xmm0
6863 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6864 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0,1],xmm0[2,3]
6865 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7]
6866 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6867 ; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %ymm13
6868 ; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %xmm0
6869 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6870 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14
6871 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5],ymm14[6,7]
6872 ; AVX2-FP-NEXT: vmovdqa 1344(%rdi), %xmm14
6873 ; AVX2-FP-NEXT: vmovdqa 1392(%rdi), %xmm0
6874 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6875 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
6876 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
6877 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6878 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm0
6879 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6880 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm4
6881 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
6882 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
6883 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6884 ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %xmm0
6885 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6886 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm2
6887 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
6888 ; AVX2-FP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
6889 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6890 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6891 ; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %xmm0
6892 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6893 ; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %ymm5
6894 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
6895 ; AVX2-FP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
6896 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6897 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6898 ; AVX2-FP-NEXT: vmovdqa 1632(%rdi), %xmm0
6899 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6900 ; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %ymm8
6901 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
6902 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6903 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6904 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0
6905 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
6906 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3
6907 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
6908 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
6909 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6910 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm1
6911 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6912 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm7
6913 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
6914 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
6915 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6916 ; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %ymm6
6917 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
6918 ; AVX2-FP-NEXT: vmovdqa 960(%rdi), %xmm10
6919 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
6920 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
6921 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6922 ; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %ymm9
6923 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
6924 ; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %xmm11
6925 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
6926 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
6927 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6928 ; AVX2-FP-NEXT: vpbroadcastq 352(%rdi), %ymm12
6929 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm12[1],ymm4[1],ymm12[3],ymm4[3]
6930 ; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm12
6931 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3]
6932 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
6933 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6934 ; AVX2-FP-NEXT: vpbroadcastq 800(%rdi), %ymm4
6935 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
6936 ; AVX2-FP-NEXT: vmovdqa 688(%rdi), %xmm4
6937 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
6938 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
6939 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6940 ; AVX2-FP-NEXT: vpbroadcastq 1248(%rdi), %ymm2
6941 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3]
6942 ; AVX2-FP-NEXT: vmovdqa 1136(%rdi), %xmm4
6943 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
6944 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
6945 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6946 ; AVX2-FP-NEXT: vpbroadcastq 1696(%rdi), %ymm2
6947 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
6948 ; AVX2-FP-NEXT: vmovdqa 1584(%rdi), %xmm4
6949 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
6950 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
6951 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6952 ; AVX2-FP-NEXT: vpbroadcastq 128(%rdi), %ymm2
6953 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
6954 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3]
6955 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6956 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6957 ; AVX2-FP-NEXT: vpbroadcastq 576(%rdi), %ymm0
6958 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6959 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3]
6960 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6961 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6962 ; AVX2-FP-NEXT: vpbroadcastq 1024(%rdi), %ymm0
6963 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3]
6964 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3]
6965 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6966 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6967 ; AVX2-FP-NEXT: vpbroadcastq 1472(%rdi), %ymm0
6968 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
6969 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm11[2,3]
6970 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6971 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6972 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm0
6973 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
6974 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm0
6975 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
6976 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6977 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6978 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6979 ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm1
6980 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6981 ; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm3
6982 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
6983 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6984 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6985 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6986 ; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %ymm1
6987 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6988 ; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %xmm7
6989 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
6990 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6991 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6992 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6993 ; AVX2-FP-NEXT: vmovdqa 1632(%rdi), %ymm1
6994 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
6995 ; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %xmm6
6996 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
6997 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6998 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
6999 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7000 ; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %ymm1
7001 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7002 ; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %xmm5
7003 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
7004 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7005 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
7006 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7007 ; AVX2-FP-NEXT: vmovdqa 960(%rdi), %ymm1
7008 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7009 ; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %xmm4
7010 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
7011 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7012 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
7013 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7014 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm1
7015 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7016 ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm8
7017 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
7018 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7019 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
7020 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7021 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm1
7022 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7023 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm2
7024 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
7025 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
7026 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3]
7027 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7028 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1
7029 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm12
7030 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
7031 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
7032 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
7033 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7034 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7035 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7036 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm14
7037 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
7038 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm2
7039 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],mem[2,3]
7040 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7041 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7042 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
7043 ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm1
7044 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7045 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
7046 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm11
7047 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],mem[2,3]
7048 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7049 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7050 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
7051 ; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm10
7052 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
7053 ; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm1
7054 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3]
7055 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
7056 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7057 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
7058 ; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm3
7059 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7060 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
7061 ; AVX2-FP-NEXT: vmovdqa 928(%rdi), %xmm4
7062 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3]
7063 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
7064 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7065 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
7066 ; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm8
7067 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm0[6,7]
7068 ; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %xmm0
7069 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
7070 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
7071 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7072 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
7073 ; AVX2-FP-NEXT: vmovdqa 1472(%rdi), %ymm15
7074 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm3[6,7]
7075 ; AVX2-FP-NEXT: vmovdqa 1376(%rdi), %xmm3
7076 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0,1],mem[2,3]
7077 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
7078 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7079 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5
7080 ; AVX2-FP-NEXT: vmovdqa 1696(%rdi), %ymm7
7081 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
7082 ; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %xmm9
7083 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],mem[2,3]
7084 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
7085 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7086 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm13
7087 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
7088 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm6
7089 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
7090 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
7091 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7092 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm12
7093 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
7094 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm6
7095 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
7096 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
7097 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7098 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm2
7099 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
7100 ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm5
7101 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
7102 ; AVX2-FP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
7103 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
7104 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7105 ; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm11
7106 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
7107 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %xmm10
7108 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
7109 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
7110 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7111 ; AVX2-FP-NEXT: vmovdqa 992(%rdi), %xmm1
7112 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
7113 ; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm2
7114 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
7115 ; AVX2-FP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
7116 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
7117 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7118 ; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %ymm4
7119 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
7120 ; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %xmm8
7121 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
7122 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7123 ; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %xmm0
7124 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
7125 ; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %ymm0
7126 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
7127 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7]
7128 ; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %ymm3
7129 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
7130 ; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %xmm1
7131 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
7132 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
7133 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload
7134 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3]
7135 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7136 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
7137 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
7138 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload
7139 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
7140 ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
7141 ; AVX2-FP-NEXT: # xmm6 = mem[0,1],xmm6[2,3]
7142 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
7143 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload
7144 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3]
7145 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7146 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3]
7147 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7]
7148 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
7149 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3]
7150 ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
7151 ; AVX2-FP-NEXT: # xmm10 = mem[0,1],xmm10[2,3]
7152 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm5[4,5,6,7]
7153 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
7154 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3]
7155 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7156 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
7157 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
7158 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
7159 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
7160 ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
7161 ; AVX2-FP-NEXT: # xmm5 = mem[0,1],xmm8[2,3]
7162 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
7163 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
7164 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3]
7165 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7166 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
7167 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
7168 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
7169 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
7170 ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
7171 ; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
7172 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm3[4,5,6,7]
7173 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7174 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi)
7175 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7176 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rsi)
7177 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7178 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi)
7179 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7180 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
7181 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7182 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rsi)
7183 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7184 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rsi)
7185 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7186 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi)
7187 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7188 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
7189 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7190 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rdx)
7191 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7192 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rdx)
7193 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7194 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx)
7195 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7196 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
7197 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7198 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rdx)
7199 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7200 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rdx)
7201 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7202 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx)
7203 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7204 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
7205 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7206 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx)
7207 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7208 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx)
7209 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7210 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx)
7211 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7212 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
7213 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7214 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rcx)
7215 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7216 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rcx)
7217 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7218 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx)
7219 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7220 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx)
7221 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7222 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
7223 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7224 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8)
7225 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7226 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r8)
7227 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7228 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r8)
7229 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7230 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8)
7231 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7232 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r8)
7233 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7234 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8)
7235 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7236 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8)
7237 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7238 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r9)
7239 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7240 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r9)
7241 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7242 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r9)
7243 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7244 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r9)
7245 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7246 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r9)
7247 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7248 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r9)
7249 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7250 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9)
7251 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7252 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r9)
7253 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7254 ; AVX2-FP-NEXT: vmovdqa %ymm7, 224(%rax)
7255 ; AVX2-FP-NEXT: vmovdqa %ymm15, 192(%rax)
7256 ; AVX2-FP-NEXT: vmovdqa %ymm14, 160(%rax)
7257 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7258 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax)
7259 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7260 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax)
7261 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7262 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax)
7263 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7264 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
7265 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7266 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
7267 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7268 ; AVX2-FP-NEXT: vmovdqa %ymm10, 224(%rax)
7269 ; AVX2-FP-NEXT: vmovdqa %ymm0, 192(%rax)
7270 ; AVX2-FP-NEXT: vmovdqa %ymm4, 160(%rax)
7271 ; AVX2-FP-NEXT: vmovdqa %ymm2, 128(%rax)
7272 ; AVX2-FP-NEXT: vmovdqa %ymm11, 96(%rax)
7273 ; AVX2-FP-NEXT: vmovdqa %ymm12, 64(%rax)
7274 ; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%rax)
7275 ; AVX2-FP-NEXT: vmovdqa %ymm9, (%rax)
7276 ; AVX2-FP-NEXT: addq $1576, %rsp # imm = 0x628
7277 ; AVX2-FP-NEXT: vzeroupper
7278 ; AVX2-FP-NEXT: retq
7279 ;
7280 ; AVX2-FCP-LABEL: load_i64_stride7_vf32:
7281 ; AVX2-FCP: # %bb.0:
7282 ; AVX2-FCP-NEXT: subq $1576, %rsp # imm = 0x628
7283 ; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %ymm1
7284 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7285 ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm6
7286 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7287 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm2
7288 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm0
7289 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7290 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
7291 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
7292 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm4
7293 ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm0
7294 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7295 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm4[0,1],xmm0[2,3]
7296 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm3[4,5,6,7]
7297 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7298 ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %xmm0
7299 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7300 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
7301 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
7302 ; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %xmm5
7303 ; AVX2-FCP-NEXT: vmovdqa 720(%rdi), %xmm0
7304 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7305 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm5[0,1],xmm0[2,3]
7306 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7]
7307 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7308 ; AVX2-FCP-NEXT: vmovdqa 1280(%rdi), %xmm0
7309 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7310 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
7311 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm3[6,7]
7312 ; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %xmm6
7313 ; AVX2-FCP-NEXT: vmovdqa 1168(%rdi), %xmm0
7314 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7315 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm6[0,1],xmm0[2,3]
7316 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm3[4,5,6,7]
7317 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7318 ; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %ymm3
7319 ; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %xmm0
7320 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7321 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm7
7322 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1,2,3,4,5],ymm7[6,7]
7323 ; AVX2-FCP-NEXT: vmovdqa 1568(%rdi), %xmm8
7324 ; AVX2-FCP-NEXT: vmovdqa 1616(%rdi), %xmm0
7325 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7326 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm8[0,1],xmm0[2,3]
7327 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm7[4,5,6,7]
7328 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7329 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
7330 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm0
7331 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7332 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm9
7333 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5],ymm9[6,7]
7334 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm0
7335 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7336 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = mem[0,1],xmm0[2,3]
7337 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7]
7338 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7339 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm9
7340 ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm0
7341 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7342 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm10
7343 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm10[6,7]
7344 ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %xmm10
7345 ; AVX2-FCP-NEXT: vmovdqa 496(%rdi), %xmm0
7346 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7347 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm10[0,1],xmm0[2,3]
7348 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm11[4,5,6,7]
7349 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7350 ; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %ymm11
7351 ; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %xmm0
7352 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7353 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm12
7354 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5],ymm12[6,7]
7355 ; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %xmm12
7356 ; AVX2-FCP-NEXT: vmovdqa 944(%rdi), %xmm0
7357 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7358 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm12[0,1],xmm0[2,3]
7359 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm13[4,5,6,7]
7360 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7361 ; AVX2-FCP-NEXT: vmovdqa 1440(%rdi), %ymm13
7362 ; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %xmm0
7363 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7364 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm14
7365 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3,4,5],ymm14[6,7]
7366 ; AVX2-FCP-NEXT: vmovdqa 1344(%rdi), %xmm14
7367 ; AVX2-FCP-NEXT: vmovdqa 1392(%rdi), %xmm0
7368 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7369 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
7370 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm15[4,5,6,7]
7371 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7372 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm0
7373 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm4[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
7374 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm4
7375 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
7376 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
7377 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7378 ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %xmm0
7379 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm5[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
7380 ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm2
7381 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
7382 ; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
7383 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7384 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7385 ; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %xmm0
7386 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm6[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
7387 ; AVX2-FCP-NEXT: vmovdqa 1280(%rdi), %ymm5
7388 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm1 # 32-byte Folded Reload
7389 ; AVX2-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
7390 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7391 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7392 ; AVX2-FCP-NEXT: vmovdqa 1632(%rdi), %xmm0
7393 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
7394 ; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %ymm8
7395 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
7396 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7397 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7398 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
7399 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
7400 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
7401 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
7402 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
7403 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7404 ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm1
7405 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm6 = ymm9[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7406 ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm7
7407 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = xmm10[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
7408 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
7409 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7410 ; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %ymm6
7411 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm9 = ymm11[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
7412 ; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %xmm10
7413 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm11 = xmm12[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
7414 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
7415 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7416 ; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %ymm9
7417 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm13[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
7418 ; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %xmm11
7419 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm13 = xmm14[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
7420 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
7421 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7422 ; AVX2-FCP-NEXT: vpbroadcastq 352(%rdi), %ymm12
7423 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm12[1],ymm4[1],ymm12[3],ymm4[3]
7424 ; AVX2-FCP-NEXT: vmovdqa 240(%rdi), %xmm12
7425 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3]
7426 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7]
7427 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7428 ; AVX2-FCP-NEXT: vpbroadcastq 800(%rdi), %ymm4
7429 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm4[1],ymm2[1],ymm4[3],ymm2[3]
7430 ; AVX2-FCP-NEXT: vmovdqa 688(%rdi), %xmm4
7431 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
7432 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
7433 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7434 ; AVX2-FCP-NEXT: vpbroadcastq 1248(%rdi), %ymm2
7435 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm5[1],ymm2[3],ymm5[3]
7436 ; AVX2-FCP-NEXT: vmovdqa 1136(%rdi), %xmm4
7437 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
7438 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
7439 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7440 ; AVX2-FCP-NEXT: vpbroadcastq 1696(%rdi), %ymm2
7441 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
7442 ; AVX2-FCP-NEXT: vmovdqa 1584(%rdi), %xmm4
7443 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
7444 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
7445 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7446 ; AVX2-FCP-NEXT: vpbroadcastq 128(%rdi), %ymm2
7447 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
7448 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = mem[0,1],xmm3[2,3]
7449 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7450 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7451 ; AVX2-FCP-NEXT: vpbroadcastq 576(%rdi), %ymm0
7452 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
7453 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3]
7454 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7455 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7456 ; AVX2-FCP-NEXT: vpbroadcastq 1024(%rdi), %ymm0
7457 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3]
7458 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm10[2,3]
7459 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7460 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7461 ; AVX2-FCP-NEXT: vpbroadcastq 1472(%rdi), %ymm0
7462 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
7463 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm11[2,3]
7464 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7465 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7466 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm0
7467 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
7468 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm0
7469 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
7470 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7471 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
7472 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7473 ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm1
7474 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7475 ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm3
7476 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
7477 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7478 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
7479 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7480 ; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %ymm1
7481 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7482 ; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %xmm7
7483 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
7484 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7485 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
7486 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7487 ; AVX2-FCP-NEXT: vmovdqa 1632(%rdi), %ymm1
7488 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7489 ; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %xmm6
7490 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
7491 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7492 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
7493 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7494 ; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %ymm1
7495 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7496 ; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %xmm5
7497 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
7498 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7499 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
7500 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7501 ; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %ymm1
7502 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7503 ; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %xmm4
7504 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
7505 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7506 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
7507 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7508 ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm1
7509 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7510 ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %xmm8
7511 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
7512 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
7513 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm2[2,3]
7514 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7515 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
7516 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
7517 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm2
7518 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
7519 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
7520 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm9[2,3]
7521 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7522 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1
7523 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
7524 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
7525 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
7526 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
7527 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7528 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7529 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
7530 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm14
7531 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
7532 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
7533 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],mem[2,3]
7534 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7535 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7536 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm0
7537 ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
7538 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7539 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
7540 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm11
7541 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],mem[2,3]
7542 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7543 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7544 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
7545 ; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm10
7546 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
7547 ; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm1
7548 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],mem[2,3]
7549 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
7550 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7551 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
7552 ; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3
7553 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7554 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
7555 ; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %xmm4
7556 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],mem[2,3]
7557 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
7558 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7559 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm0
7560 ; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm8
7561 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm0[6,7]
7562 ; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %xmm0
7563 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
7564 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm3[4,5,6,7]
7565 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7566 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
7567 ; AVX2-FCP-NEXT: vmovdqa 1472(%rdi), %ymm15
7568 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4,5],ymm3[6,7]
7569 ; AVX2-FCP-NEXT: vmovdqa 1376(%rdi), %xmm3
7570 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm3[0,1],mem[2,3]
7571 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3],ymm5[4,5,6,7]
7572 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7573 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm5
7574 ; AVX2-FCP-NEXT: vmovdqa 1696(%rdi), %ymm7
7575 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
7576 ; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %xmm9
7577 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1],mem[2,3]
7578 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
7579 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7580 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
7581 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
7582 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm6
7583 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm6 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
7584 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
7585 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7586 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm12
7587 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm14[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
7588 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm6
7589 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
7590 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
7591 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7592 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm2
7593 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
7594 ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm5
7595 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm11 # 32-byte Folded Reload
7596 ; AVX2-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
7597 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm11[4,5,6,7]
7598 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7599 ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm11
7600 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm2 = ymm10[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
7601 ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm10
7602 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
7603 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
7604 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7605 ; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %xmm1
7606 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
7607 ; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm2
7608 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm4 # 32-byte Folded Reload
7609 ; AVX2-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
7610 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
7611 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7612 ; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %ymm4
7613 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
7614 ; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %xmm8
7615 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
7616 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7617 ; AVX2-FCP-NEXT: vmovdqa 1440(%rdi), %xmm0
7618 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm3[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
7619 ; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %ymm0
7620 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
7621 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm3[4,5,6,7]
7622 ; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %ymm3
7623 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm7 = ymm7[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
7624 ; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %xmm1
7625 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
7626 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
7627 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 16-byte Folded Reload
7628 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm9 = ymm9[1],ymm13[1],ymm9[3],ymm13[3]
7629 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7630 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
7631 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3],ymm9[4,5,6,7]
7632 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 16-byte Folded Reload
7633 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm12 = ymm13[1],ymm12[1],ymm13[3],ymm12[3]
7634 ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
7635 ; AVX2-FCP-NEXT: # xmm6 = mem[0,1],xmm6[2,3]
7636 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm12[4,5,6,7]
7637 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 16-byte Folded Reload
7638 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm12[1],ymm5[1],ymm12[3],ymm5[3]
7639 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
7640 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3]
7641 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm5[4,5,6,7]
7642 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
7643 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm5 = ymm5[1],ymm11[1],ymm5[3],ymm11[3]
7644 ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
7645 ; AVX2-FCP-NEXT: # xmm10 = mem[0,1],xmm10[2,3]
7646 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm5[4,5,6,7]
7647 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
7648 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm5[1],ymm2[1],ymm5[3],ymm2[3]
7649 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7650 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
7651 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
7652 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
7653 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm5[1],ymm4[1],ymm5[3],ymm4[3]
7654 ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
7655 ; AVX2-FCP-NEXT: # xmm5 = mem[0,1],xmm8[2,3]
7656 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
7657 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
7658 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm5[1],ymm0[1],ymm5[3],ymm0[3]
7659 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7660 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
7661 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
7662 ; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 16-byte Folded Reload
7663 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm3 = ymm5[1],ymm3[1],ymm5[3],ymm3[3]
7664 ; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
7665 ; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
7666 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm3[4,5,6,7]
7667 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7668 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rsi)
7669 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7670 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rsi)
7671 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7672 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi)
7673 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7674 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
7675 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7676 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rsi)
7677 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7678 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rsi)
7679 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7680 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi)
7681 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7682 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi)
7683 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7684 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rdx)
7685 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7686 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rdx)
7687 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7688 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx)
7689 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7690 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
7691 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7692 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rdx)
7693 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7694 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rdx)
7695 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7696 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx)
7697 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7698 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx)
7699 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7700 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx)
7701 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7702 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx)
7703 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7704 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx)
7705 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7706 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
7707 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7708 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rcx)
7709 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7710 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rcx)
7711 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7712 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx)
7713 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7714 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx)
7715 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7716 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
7717 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7718 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8)
7719 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7720 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r8)
7721 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7722 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r8)
7723 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7724 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8)
7725 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7726 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r8)
7727 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7728 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8)
7729 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7730 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8)
7731 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7732 ; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r9)
7733 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7734 ; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r9)
7735 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7736 ; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r9)
7737 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7738 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r9)
7739 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7740 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r9)
7741 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7742 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r9)
7743 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7744 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9)
7745 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7746 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9)
7747 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7748 ; AVX2-FCP-NEXT: vmovdqa %ymm7, 224(%rax)
7749 ; AVX2-FCP-NEXT: vmovdqa %ymm15, 192(%rax)
7750 ; AVX2-FCP-NEXT: vmovdqa %ymm14, 160(%rax)
7751 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7752 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax)
7753 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7754 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax)
7755 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7756 ; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rax)
7757 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7758 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
7759 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7760 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
7761 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7762 ; AVX2-FCP-NEXT: vmovdqa %ymm10, 224(%rax)
7763 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 192(%rax)
7764 ; AVX2-FCP-NEXT: vmovdqa %ymm4, 160(%rax)
7765 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 128(%rax)
7766 ; AVX2-FCP-NEXT: vmovdqa %ymm11, 96(%rax)
7767 ; AVX2-FCP-NEXT: vmovdqa %ymm12, 64(%rax)
7768 ; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%rax)
7769 ; AVX2-FCP-NEXT: vmovdqa %ymm9, (%rax)
7770 ; AVX2-FCP-NEXT: addq $1576, %rsp # imm = 0x628
7771 ; AVX2-FCP-NEXT: vzeroupper
7772 ; AVX2-FCP-NEXT: retq
7774 ; AVX512-LABEL: load_i64_stride7_vf32:
7775 ; AVX512: # %bb.0:
7776 ; AVX512-NEXT: subq $2728, %rsp # imm = 0xAA8
7777 ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm25
7778 ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm1
7779 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm31
7780 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm12
7781 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm2
7782 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm20
7783 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm14
7784 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7
7785 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7786 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm26
7787 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm19
7788 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm23
7789 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm6
7790 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0]
7791 ; AVX512-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
7792 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3
7793 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13
7794 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7795 ; AVX512-NEXT: vpermt2q %zmm23, %zmm30, %zmm3
7796 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7797 ; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7798 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
7799 ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7800 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm6
7801 ; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
7802 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7803 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3
7804 ; AVX512-NEXT: vpermt2q %zmm14, %zmm30, %zmm3
7805 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7806 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7807 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3
7808 ; AVX512-NEXT: vpermt2q %zmm2, %zmm8, %zmm3
7809 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7810 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
7811 ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7812 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
7813 ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
7814 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4]
7815 ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7816 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3
7817 ; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm3
7818 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7819 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3
7820 ; AVX512-NEXT: vpermt2q %zmm26, %zmm15, %zmm3
7821 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7822 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
7823 ; AVX512-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7824 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3
7825 ; AVX512-NEXT: vpermt2q %zmm2, %zmm22, %zmm3
7826 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7827 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3
7828 ; AVX512-NEXT: vpermt2q %zmm26, %zmm22, %zmm3
7829 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7830 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3
7831 ; AVX512-NEXT: vpermt2q %zmm2, %zmm11, %zmm3
7832 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7833 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3
7834 ; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
7835 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7836 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm3
7837 ; AVX512-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
7838 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7839 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3
7840 ; AVX512-NEXT: vpermt2q %zmm26, %zmm30, %zmm3
7841 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7842 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3
7843 ; AVX512-NEXT: vpermt2q %zmm20, %zmm9, %zmm3
7844 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7845 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
7846 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7847 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm20
7848 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm21
7849 ; AVX512-NEXT: vpermt2q %zmm19, %zmm9, %zmm21
7850 ; AVX512-NEXT: vpermt2q %zmm26, %zmm0, %zmm19
7851 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm3
7852 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7853 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2
7854 ; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7855 ; AVX512-NEXT: vpermt2q %zmm31, %zmm30, %zmm2
7856 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7857 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12
7858 ; AVX512-NEXT: vpermt2q %zmm25, %zmm8, %zmm12
7859 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm28
7860 ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm2
7861 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5
7862 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7
7863 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7864 ; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm5
7865 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7866 ; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm2
7867 ; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm10
7868 ; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm8
7869 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm18
7870 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm6
7871 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7872 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5
7873 ; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm6
7874 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7875 ; AVX512-NEXT: vpermt2q %zmm3, %zmm9, %zmm31
7876 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4
7877 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm6
7878 ; AVX512-NEXT: vpermt2q %zmm25, %zmm15, %zmm4
7879 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm17
7880 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm15
7881 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm4
7882 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7883 ; AVX512-NEXT: vpermt2q %zmm15, %zmm11, %zmm4
7884 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7885 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm26
7886 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
7887 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm16
7888 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4
7889 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7890 ; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm4
7891 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7892 ; AVX512-NEXT: vpermt2q %zmm13, %zmm9, %zmm23
7893 ; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm11
7894 ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm4
7895 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7896 ; AVX512-NEXT: vpermt2q %zmm11, %zmm5, %zmm4
7897 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7898 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm27
7899 ; AVX512-NEXT: vpermt2q %zmm7, %zmm9, %zmm27
7900 ; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm6
7901 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29
7902 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3
7903 ; AVX512-NEXT: vpermt2q %zmm25, %zmm22, %zmm29
7904 ; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm22
7905 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm14
7906 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm22
7907 ; AVX512-NEXT: vpermt2q %zmm25, %zmm5, %zmm22
7908 ; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm5
7909 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13
7910 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm25
7911 ; AVX512-NEXT: vpermt2q %zmm3, %zmm30, %zmm25
7912 ; AVX512-NEXT: vpermi2q %zmm2, %zmm10, %zmm30
7913 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm24
7914 ; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm24
7915 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4
7916 ; AVX512-NEXT: vpermi2q %zmm10, %zmm2, %zmm9
7917 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm7
7918 ; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm10
7919 ; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm4
7920 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm0
7921 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
7922 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
7923 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7924 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
7925 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7926 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm3
7927 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7928 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
7929 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7930 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm5
7931 ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm12
7932 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7933 ; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm9
7934 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm8
7935 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7936 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
7937 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
7938 ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm17
7939 ; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7940 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7941 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
7942 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7943 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7944 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
7945 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7946 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm6
7947 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7948 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
7949 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
7950 ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm29
7951 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7952 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
7953 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7954 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7955 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
7956 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7957 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm14
7958 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7959 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
7960 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
7961 ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm22
7962 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7963 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
7964 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7965 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7966 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
7967 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7968 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm13
7969 ; AVX512-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7970 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
7971 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
7972 ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm25
7973 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
7974 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm8
7975 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
7976 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm14
7977 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm30
7978 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
7979 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
7980 ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm24
7981 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
7982 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
7983 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7984 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm21
7985 ; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7986 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm7
7987 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7988 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
7989 ; AVX512-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
7990 ; AVX512-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
7991 ; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7992 ; AVX512-NEXT: vpermt2q %zmm0, %zmm2, %zmm19
7993 ; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7994 ; AVX512-NEXT: vpermt2q %zmm9, %zmm2, %zmm10
7995 ; AVX512-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
7996 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7997 ; AVX512-NEXT: movb $24, %al
7998 ; AVX512-NEXT: kmovw %eax, %k1
7999 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8000 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8001 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
8002 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8003 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
8004 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
8005 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
8006 ; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
8007 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0]
8008 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
8009 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm20
8010 ; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm20
8011 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11]
8012 ; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm1
8013 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8014 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8015 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8016 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5]
8017 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8018 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15
8019 ; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm15
8020 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm19
8021 ; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm19
8022 ; AVX512-NEXT: vpermt2q %zmm16, %zmm21, %zmm3
8023 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8024 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8025 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8026 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5]
8027 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8028 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8029 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16
8030 ; AVX512-NEXT: vpermt2q %zmm18, %zmm0, %zmm16
8031 ; AVX512-NEXT: vpermi2q %zmm11, %zmm3, %zmm0
8032 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm17
8033 ; AVX512-NEXT: vpermt2q %zmm1, %zmm4, %zmm17
8034 ; AVX512-NEXT: vpermi2q %zmm3, %zmm11, %zmm4
8035 ; AVX512-NEXT: vpermt2q %zmm11, %zmm21, %zmm3
8036 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8037 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8038 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
8039 ; AVX512-NEXT: vpermt2q %zmm18, %zmm21, %zmm1
8040 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8041 ; AVX512-NEXT: vmovdqa 912(%rdi), %xmm11
8042 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
8043 ; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1
8044 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8045 ; AVX512-NEXT: vmovdqa 464(%rdi), %xmm11
8046 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
8047 ; AVX512-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1
8048 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8049 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
8050 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
8051 ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21
8052 ; AVX512-NEXT: vmovdqa 1360(%rdi), %xmm2
8053 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
8054 ; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16
8055 ; AVX512-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
8056 ; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm7
8057 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7]
8058 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11]
8059 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
8060 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm12
8061 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
8062 ; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm12
8063 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
8064 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1
8065 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8066 ; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11
8067 ; AVX512-NEXT: vmovdqa 576(%rdi), %ymm1
8068 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
8069 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm25
8070 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8071 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15
8072 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
8073 ; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm15
8074 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
8075 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1
8076 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8077 ; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
8078 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm1
8079 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8080 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
8081 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8082 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1
8083 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
8084 ; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
8085 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
8086 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1
8087 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8088 ; AVX512-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
8089 ; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm11
8090 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
8091 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8092 ; AVX512-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
8093 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8094 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
8095 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8096 ; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm0
8097 ; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
8098 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
8099 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1
8100 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm7
8101 ; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm7
8102 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
8103 ; AVX512-NEXT: vpermt2q %zmm12, %zmm30, %zmm1
8104 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8105 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm6
8106 ; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm6
8107 ; AVX512-NEXT: vpermt2q %zmm15, %zmm30, %zmm3
8108 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8109 ; AVX512-NEXT: vmovdqa 960(%rdi), %ymm15
8110 ; AVX512-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
8111 ; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm15
8112 ; AVX512-NEXT: vmovdqa 512(%rdi), %ymm12
8113 ; AVX512-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
8114 ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12
8115 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm2
8116 ; AVX512-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
8117 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm3
8118 ; AVX512-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
8119 ; AVX512-NEXT: vpermt2q %zmm28, %zmm30, %zmm3
8120 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8121 ; AVX512-NEXT: vpermt2q %zmm13, %zmm30, %zmm5
8122 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8123 ; AVX512-NEXT: movb $-32, %al
8124 ; AVX512-NEXT: kmovw %eax, %k2
8125 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
8126 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8127 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
8128 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
8129 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8130 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2}
8131 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm13
8132 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8133 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
8134 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
8135 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8136 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2}
8137 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8138 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
8139 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8140 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
8141 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8142 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
8143 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8144 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
8145 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8146 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
8147 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8148 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
8149 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8150 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
8151 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8152 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
8153 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
8154 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
8155 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8156 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
8157 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
8158 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8159 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2}
8160 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8161 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2}
8162 ; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17
8163 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
8164 ; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12
8165 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8166 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2}
8167 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
8168 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm14
8169 ; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
8170 ; AVX512-NEXT: vextracti128 $1, %ymm14, %xmm14
8171 ; AVX512-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9
8172 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8173 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2}
8174 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
8175 ; AVX512-NEXT: vmovdqa 640(%rdi), %ymm14
8176 ; AVX512-NEXT: vmovdqa64 %ymm25, %ymm2
8177 ; AVX512-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
8178 ; AVX512-NEXT: vmovdqa 1408(%rdi), %ymm15
8179 ; AVX512-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
8180 ; AVX512-NEXT: vextracti128 $1, %ymm15, %xmm15
8181 ; AVX512-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4
8182 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8183 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2}
8184 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7]
8185 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8186 ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3
8187 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm14
8188 ; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
8189 ; AVX512-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
8190 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7]
8191 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8192 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2
8193 ; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm14
8194 ; AVX512-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
8195 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
8196 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8197 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
8198 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8199 ; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
8200 ; AVX512-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7]
8201 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8202 ; AVX512-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11
8203 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8204 ; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
8205 ; AVX512-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7]
8206 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8207 ; AVX512-NEXT: vinsertf64x4 $0, %ymm8, %zmm6, %zmm8
8208 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8209 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
8210 ; AVX512-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7]
8211 ; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6
8212 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
8213 ; AVX512-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
8214 ; AVX512-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
8215 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
8216 ; AVX512-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7
8217 ; AVX512-NEXT: vmovdqa64 %zmm18, 192(%rsi)
8218 ; AVX512-NEXT: vmovdqa64 %zmm13, 128(%rsi)
8219 ; AVX512-NEXT: vmovdqa64 %zmm30, 64(%rsi)
8220 ; AVX512-NEXT: vmovdqa64 %zmm28, (%rsi)
8221 ; AVX512-NEXT: vmovdqa64 %zmm27, 192(%rdx)
8222 ; AVX512-NEXT: vmovdqa64 %zmm23, (%rdx)
8223 ; AVX512-NEXT: vmovdqa64 %zmm26, 64(%rdx)
8224 ; AVX512-NEXT: vmovdqa64 %zmm31, 128(%rdx)
8225 ; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rcx)
8226 ; AVX512-NEXT: vmovdqa64 %zmm21, (%rcx)
8227 ; AVX512-NEXT: vmovdqa64 %zmm29, 64(%rcx)
8228 ; AVX512-NEXT: vmovdqa64 %zmm5, 128(%rcx)
8229 ; AVX512-NEXT: vmovdqa64 %zmm4, 192(%r8)
8230 ; AVX512-NEXT: vmovdqa64 %zmm9, (%r8)
8231 ; AVX512-NEXT: vmovdqa64 %zmm12, 64(%r8)
8232 ; AVX512-NEXT: vmovdqa64 %zmm17, 128(%r8)
8233 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
8234 ; AVX512-NEXT: vmovaps %zmm4, 192(%r9)
8235 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
8236 ; AVX512-NEXT: vmovaps %zmm4, (%r9)
8237 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
8238 ; AVX512-NEXT: vmovaps %zmm4, 64(%r9)
8239 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
8240 ; AVX512-NEXT: vmovaps %zmm4, 128(%r9)
8241 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
8242 ; AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax)
8243 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax)
8244 ; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax)
8245 ; AVX512-NEXT: vmovdqa64 %zmm1, 128(%rax)
8246 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
8247 ; AVX512-NEXT: vmovaps %zmm7, 128(%rax)
8248 ; AVX512-NEXT: vmovdqa64 %zmm6, 192(%rax)
8249 ; AVX512-NEXT: vmovaps %zmm8, (%rax)
8250 ; AVX512-NEXT: vmovaps %zmm11, 64(%rax)
8251 ; AVX512-NEXT: addq $2728, %rsp # imm = 0xAA8
8252 ; AVX512-NEXT: vzeroupper
8253 ; AVX512-NEXT: retq
8255 ; AVX512-FCP-LABEL: load_i64_stride7_vf32:
8256 ; AVX512-FCP: # %bb.0:
8257 ; AVX512-FCP-NEXT: subq $2728, %rsp # imm = 0xAA8
8258 ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm25
8259 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
8260 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31
8261 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm12
8262 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2
8263 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20
8264 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14
8265 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
8266 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8267 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
8268 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19
8269 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23
8270 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
8271 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0]
8272 ; AVX512-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
8273 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
8274 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
8275 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8276 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3
8277 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8278 ; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8279 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
8280 ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8281 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm6
8282 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
8283 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8284 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
8285 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm3
8286 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8287 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8288 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
8289 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3
8290 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8291 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
8292 ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8293 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
8294 ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8295 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4]
8296 ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8297 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
8298 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm3
8299 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8300 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
8301 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3
8302 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8303 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
8304 ; AVX512-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8305 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
8306 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm3
8307 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8308 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
8309 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3
8310 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8311 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
8312 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3
8313 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8314 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
8315 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
8316 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8317 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
8318 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
8319 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8320 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
8321 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm3
8322 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8323 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
8324 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm3
8325 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8326 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
8327 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8328 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm20
8329 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm21
8330 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm21
8331 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm19
8332 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
8333 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8334 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
8335 ; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8336 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm30, %zmm2
8337 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8338 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
8339 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm12
8340 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28
8341 ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2
8342 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
8343 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
8344 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8345 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm5
8346 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8347 ; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2
8348 ; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm10
8349 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm8
8350 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm18
8351 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6
8352 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8353 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
8354 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm6
8355 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8356 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm31
8357 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
8358 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm6
8359 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4
8360 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm17
8361 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm15
8362 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
8363 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8364 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm4
8365 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8366 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm26
8367 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
8368 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16
8369 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
8370 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8371 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4
8372 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8373 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm9, %zmm23
8374 ; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm11
8375 ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4
8376 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8377 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4
8378 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8379 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm27
8380 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm27
8381 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm6
8382 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29
8383 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3
8384 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm29
8385 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm22
8386 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm14
8387 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm22
8388 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm22
8389 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm5
8390 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8391 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm25
8392 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm25
8393 ; AVX512-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm30
8394 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm24
8395 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm24
8396 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
8397 ; AVX512-FCP-NEXT: vpermi2q %zmm10, %zmm2, %zmm9
8398 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm7
8399 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm10
8400 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4
8401 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
8402 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
8403 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8404 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8405 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
8406 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8407 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
8408 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8409 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
8410 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8411 ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5
8412 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm12
8413 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8414 ; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9
8415 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8
8416 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8417 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
8418 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8419 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm17
8420 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8421 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8422 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
8423 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8424 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8425 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
8426 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8427 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6
8428 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8429 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
8430 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8431 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm29
8432 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8433 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
8434 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8435 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8436 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
8437 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8438 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14
8439 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8440 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
8441 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8442 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm22
8443 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8444 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
8445 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8446 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8447 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
8448 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8449 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm13
8450 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8451 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
8452 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8453 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm25
8454 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8455 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm8
8456 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
8457 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14
8458 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm30
8459 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
8460 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8461 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm24
8462 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8463 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
8464 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8465 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm21
8466 ; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8467 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm7
8468 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8469 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
8470 ; AVX512-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8471 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
8472 ; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8473 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm19
8474 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8475 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm10
8476 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
8477 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8478 ; AVX512-FCP-NEXT: movb $24, %al
8479 ; AVX512-FCP-NEXT: kmovw %eax, %k1
8480 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8481 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8482 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
8483 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8484 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
8485 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
8486 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
8487 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
8488 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0]
8489 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
8490 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm20
8491 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm20
8492 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11]
8493 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1
8494 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8495 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8496 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8497 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5]
8498 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8499 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
8500 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15
8501 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
8502 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm19
8503 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm3
8504 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8505 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8506 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8507 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5]
8508 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8509 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8510 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
8511 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16
8512 ; AVX512-FCP-NEXT: vpermi2q %zmm11, %zmm3, %zmm0
8513 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm17
8514 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm17
8515 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm4
8516 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm3
8517 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8518 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
8519 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
8520 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm1
8521 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8522 ; AVX512-FCP-NEXT: vmovdqa 912(%rdi), %xmm11
8523 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
8524 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1
8525 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8526 ; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm11
8527 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
8528 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1
8529 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8530 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
8531 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
8532 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21
8533 ; AVX512-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
8534 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
8535 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16
8536 ; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
8537 ; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm7
8538 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7]
8539 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11]
8540 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
8541 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
8542 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
8543 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm12
8544 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
8545 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1
8546 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8547 ; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11
8548 ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
8549 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
8550 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
8551 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8552 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15
8553 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
8554 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm15
8555 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
8556 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1
8557 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8558 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
8559 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
8560 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8561 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
8562 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8563 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
8564 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
8565 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
8566 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
8567 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1
8568 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8569 ; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
8570 ; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm11
8571 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
8572 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8573 ; AVX512-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
8574 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8575 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
8576 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8577 ; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0
8578 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
8579 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
8580 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
8581 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
8582 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7
8583 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
8584 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm1
8585 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8586 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm6
8587 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6
8588 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm3
8589 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8590 ; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm15
8591 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
8592 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
8593 ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm12
8594 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
8595 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
8596 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
8597 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
8598 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm3
8599 ; AVX512-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
8600 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm3
8601 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8602 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm5
8603 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8604 ; AVX512-FCP-NEXT: movb $-32, %al
8605 ; AVX512-FCP-NEXT: kmovw %eax, %k2
8606 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
8607 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8608 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
8609 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
8610 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8611 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2}
8612 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
8613 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8614 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
8615 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
8616 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8617 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2}
8618 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8619 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
8620 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8621 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
8622 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8623 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
8624 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8625 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
8626 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8627 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
8628 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8629 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
8630 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8631 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
8632 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8633 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
8634 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
8635 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
8636 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8637 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
8638 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
8639 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8640 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2}
8641 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8642 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2}
8643 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17
8644 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
8645 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12
8646 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8647 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2}
8648 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
8649 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
8650 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
8651 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
8652 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9
8653 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8654 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2}
8655 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
8656 ; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm14
8657 ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
8658 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
8659 ; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm15
8660 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
8661 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
8662 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4
8663 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8664 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2}
8665 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7]
8666 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
8667 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3
8668 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
8669 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
8670 ; AVX512-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
8671 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7]
8672 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8673 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2
8674 ; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14
8675 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
8676 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
8677 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8678 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
8679 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8680 ; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
8681 ; AVX512-FCP-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7]
8682 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8683 ; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11
8684 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8685 ; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
8686 ; AVX512-FCP-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7]
8687 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8688 ; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm6, %zmm8
8689 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
8690 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
8691 ; AVX512-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7]
8692 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6
8693 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
8694 ; AVX512-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
8695 ; AVX512-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
8696 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
8697 ; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7
8698 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi)
8699 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rsi)
8700 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 64(%rsi)
8701 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%rsi)
8702 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 192(%rdx)
8703 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%rdx)
8704 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx)
8705 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 128(%rdx)
8706 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 192(%rcx)
8707 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
8708 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx)
8709 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx)
8710 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
8711 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%r8)
8712 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 64(%r8)
8713 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 128(%r8)
8714 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
8715 ; AVX512-FCP-NEXT: vmovaps %zmm4, 192(%r9)
8716 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
8717 ; AVX512-FCP-NEXT: vmovaps %zmm4, (%r9)
8718 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
8719 ; AVX512-FCP-NEXT: vmovaps %zmm4, 64(%r9)
8720 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
8721 ; AVX512-FCP-NEXT: vmovaps %zmm4, 128(%r9)
8722 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8723 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
8724 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
8725 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
8726 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax)
8727 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8728 ; AVX512-FCP-NEXT: vmovaps %zmm7, 128(%rax)
8729 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
8730 ; AVX512-FCP-NEXT: vmovaps %zmm8, (%rax)
8731 ; AVX512-FCP-NEXT: vmovaps %zmm11, 64(%rax)
8732 ; AVX512-FCP-NEXT: addq $2728, %rsp # imm = 0xAA8
8733 ; AVX512-FCP-NEXT: vzeroupper
8734 ; AVX512-FCP-NEXT: retq
8735 ;
8736 ; AVX512DQ-LABEL: load_i64_stride7_vf32:
8737 ; AVX512DQ: # %bb.0:
8738 ; AVX512DQ-NEXT: subq $2728, %rsp # imm = 0xAA8
8739 ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm25
8740 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm1
8741 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm31
8742 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm12
8743 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm2
8744 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm20
8745 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm14
8746 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7
8747 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8748 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm26
8749 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm19
8750 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm23
8751 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm6
8752 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0]
8753 ; AVX512DQ-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
8754 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3
8755 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13
8756 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8757 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm30, %zmm3
8758 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8759 ; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8760 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
8761 ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8762 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm6
8763 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
8764 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8765 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3
8766 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm30, %zmm3
8767 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8768 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8769 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3
8770 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm8, %zmm3
8771 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8772 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
8773 ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8774 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
8775 ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
8776 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4]
8777 ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8778 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3
8779 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm3
8780 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8781 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3
8782 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm15, %zmm3
8783 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8784 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
8785 ; AVX512DQ-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8786 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3
8787 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm22, %zmm3
8788 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8789 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3
8790 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm22, %zmm3
8791 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8792 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3
8793 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm11, %zmm3
8794 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8795 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3
8796 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
8797 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8798 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm3
8799 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
8800 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8801 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3
8802 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm30, %zmm3
8803 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8804 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3
8805 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm9, %zmm3
8806 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8807 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
8808 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8809 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm20
8810 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm21
8811 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm21
8812 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm0, %zmm19
8813 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm3
8814 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8815 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2
8816 ; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8817 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm30, %zmm2
8818 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8819 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12
8820 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm8, %zmm12
8821 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm28
8822 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm2
8823 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5
8824 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7
8825 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8826 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm30, %zmm5
8827 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8828 ; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm2
8829 ; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm10
8830 ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm8
8831 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm18
8832 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm6
8833 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8834 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5
8835 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm11, %zmm6
8836 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8837 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm9, %zmm31
8838 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4
8839 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm6
8840 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm15, %zmm4
8841 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm17
8842 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm15
8843 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm4
8844 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8845 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm11, %zmm4
8846 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8847 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm26
8848 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
8849 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm16
8850 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm4
8851 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8852 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm11, %zmm4
8853 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8854 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm9, %zmm23
8855 ; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm11
8856 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm4
8857 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8858 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm5, %zmm4
8859 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8860 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm27
8861 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm9, %zmm27
8862 ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm6
8863 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29
8864 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3
8865 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm22, %zmm29
8866 ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm22
8867 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm14
8868 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm22
8869 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm5, %zmm22
8870 ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm5
8871 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13
8872 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm25
8873 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm30, %zmm25
8874 ; AVX512DQ-NEXT: vpermi2q %zmm2, %zmm10, %zmm30
8875 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm24
8876 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm9, %zmm24
8877 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4
8878 ; AVX512DQ-NEXT: vpermi2q %zmm10, %zmm2, %zmm9
8879 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm7
8880 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm10
8881 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm4
8882 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm0
8883 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
8884 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8885 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8886 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
8887 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8888 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm3
8889 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8890 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
8891 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8892 ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm5
8893 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm12
8894 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8895 ; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm9
8896 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm8
8897 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8898 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
8899 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8900 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm17
8901 ; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8902 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8903 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
8904 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8905 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8906 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
8907 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8908 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm6
8909 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8910 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
8911 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8912 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm29
8913 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8914 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
8915 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8916 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8917 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
8918 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8919 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm14
8920 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8921 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
8922 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8923 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm22
8924 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8925 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
8926 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8927 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8928 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
8929 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8930 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm13
8931 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8932 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
8933 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8934 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm25
8935 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
8936 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm8
8937 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
8938 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm14
8939 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm30
8940 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
8941 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8942 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm24
8943 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8944 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
8945 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8946 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm21
8947 ; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8948 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm7
8949 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8950 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
8951 ; AVX512DQ-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
8952 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
8953 ; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8954 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm2, %zmm19
8955 ; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8956 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm2, %zmm10
8957 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
8958 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8959 ; AVX512DQ-NEXT: movb $24, %al
8960 ; AVX512DQ-NEXT: kmovw %eax, %k1
8961 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
8962 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8963 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
8964 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8965 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
8966 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
8967 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
8968 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
8969 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0]
8970 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
8971 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm20
8972 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm20
8973 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11]
8974 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm21, %zmm1
8975 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8976 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8977 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8978 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5]
8979 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8980 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15
8981 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm15
8982 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm19
8983 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm19
8984 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm21, %zmm3
8985 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8986 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
8987 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8988 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5]
8989 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8990 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
8991 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16
8992 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm0, %zmm16
8993 ; AVX512DQ-NEXT: vpermi2q %zmm11, %zmm3, %zmm0
8994 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm17
8995 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm4, %zmm17
8996 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm11, %zmm4
8997 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm21, %zmm3
8998 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
8999 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
9000 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
9001 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm21, %zmm1
9002 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9003 ; AVX512DQ-NEXT: vmovdqa 912(%rdi), %xmm11
9004 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
9005 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1
9006 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9007 ; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm11
9008 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
9009 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1
9010 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9011 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2
9012 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
9013 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21
9014 ; AVX512DQ-NEXT: vmovdqa 1360(%rdi), %xmm2
9015 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
9016 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16
9017 ; AVX512DQ-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
9018 ; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm7
9019 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7]
9020 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11]
9021 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
9022 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm12
9023 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
9024 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm12
9025 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
9026 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1
9027 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9028 ; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11
9029 ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm1
9030 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
9031 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm25
9032 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9033 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm15
9034 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
9035 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm15
9036 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
9037 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1
9038 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9039 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
9040 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm1
9041 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9042 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
9043 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9044 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1
9045 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
9046 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
9047 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
9048 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1
9049 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9050 ; AVX512DQ-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
9051 ; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm11
9052 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
9053 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9054 ; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
9055 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9056 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
9057 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9058 ; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm0
9059 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
9060 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
9061 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1
9062 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm7
9063 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm7
9064 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
9065 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm30, %zmm1
9066 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9067 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm6
9068 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm6
9069 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm30, %zmm3
9070 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9071 ; AVX512DQ-NEXT: vmovdqa 960(%rdi), %ymm15
9072 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
9073 ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm15
9074 ; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm12
9075 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
9076 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12
9077 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm2
9078 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
9079 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm3
9080 ; AVX512DQ-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
9081 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm30, %zmm3
9082 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9083 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm30, %zmm5
9084 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9085 ; AVX512DQ-NEXT: movb $-32, %al
9086 ; AVX512DQ-NEXT: kmovw %eax, %k2
9087 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
9088 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9089 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
9090 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
9091 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9092 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2}
9093 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm13
9094 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9095 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
9096 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
9097 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9098 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2}
9099 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9100 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
9101 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9102 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
9103 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9104 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
9105 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9106 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
9107 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9108 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
9109 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9110 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
9111 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9112 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
9113 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9114 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
9115 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
9116 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
9117 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9118 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
9119 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
9120 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9121 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2}
9122 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9123 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2}
9124 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17
9125 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
9126 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12
9127 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9128 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2}
9129 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
9130 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm14
9131 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
9132 ; AVX512DQ-NEXT: vextracti128 $1, %ymm14, %xmm14
9133 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9
9134 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9135 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2}
9136 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
9137 ; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm14
9138 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm2
9139 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
9140 ; AVX512DQ-NEXT: vmovdqa 1408(%rdi), %ymm15
9141 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
9142 ; AVX512DQ-NEXT: vextracti128 $1, %ymm15, %xmm15
9143 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4
9144 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9145 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2}
9146 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7]
9147 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9148 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3
9149 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm14
9150 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
9151 ; AVX512DQ-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
9152 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7]
9153 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9154 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2
9155 ; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm14
9156 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
9157 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
9158 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9159 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
9160 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9161 ; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
9162 ; AVX512DQ-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7]
9163 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9164 ; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11
9165 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9166 ; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
9167 ; AVX512DQ-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7]
9168 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9169 ; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm8, %zmm6, %zmm8
9170 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9171 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
9172 ; AVX512DQ-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7]
9173 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6
9174 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9175 ; AVX512DQ-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
9176 ; AVX512DQ-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
9177 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
9178 ; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7
9179 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 192(%rsi)
9180 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, 128(%rsi)
9181 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 64(%rsi)
9182 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, (%rsi)
9183 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 192(%rdx)
9184 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%rdx)
9185 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%rdx)
9186 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 128(%rdx)
9187 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rcx)
9188 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rcx)
9189 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, 64(%rcx)
9190 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 128(%rcx)
9191 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%r8)
9192 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%r8)
9193 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 64(%r8)
9194 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 128(%r8)
9195 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9196 ; AVX512DQ-NEXT: vmovaps %zmm4, 192(%r9)
9197 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9198 ; AVX512DQ-NEXT: vmovaps %zmm4, (%r9)
9199 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9200 ; AVX512DQ-NEXT: vmovaps %zmm4, 64(%r9)
9201 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9202 ; AVX512DQ-NEXT: vmovaps %zmm4, 128(%r9)
9203 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
9204 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax)
9205 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax)
9206 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax)
9207 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 128(%rax)
9208 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
9209 ; AVX512DQ-NEXT: vmovaps %zmm7, 128(%rax)
9210 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 192(%rax)
9211 ; AVX512DQ-NEXT: vmovaps %zmm8, (%rax)
9212 ; AVX512DQ-NEXT: vmovaps %zmm11, 64(%rax)
9213 ; AVX512DQ-NEXT: addq $2728, %rsp # imm = 0xAA8
9214 ; AVX512DQ-NEXT: vzeroupper
9215 ; AVX512DQ-NEXT: retq
9216 ;
9217 ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf32:
9218 ; AVX512DQ-FCP: # %bb.0:
9219 ; AVX512DQ-FCP-NEXT: subq $2728, %rsp # imm = 0xAA8
9220 ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm25
9221 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
9222 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm31
9223 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm12
9224 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm2
9225 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm20
9226 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm14
9227 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
9228 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9229 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
9230 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm19
9231 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23
9232 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm6
9233 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [0,7,14,0,0,7,14,0]
9234 ; AVX512DQ-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
9235 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
9236 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
9237 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9238 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm30, %zmm3
9239 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9240 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9241 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
9242 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9243 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm6
9244 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
9245 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9246 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
9247 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm30, %zmm3
9248 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9249 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9250 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
9251 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm8, %zmm3
9252 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9253 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
9254 ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9255 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
9256 ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
9257 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [11,4,11,4,11,4,11,4]
9258 ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9259 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
9260 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm3
9261 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9262 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
9263 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm15, %zmm3
9264 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9265 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm22 = [12,5,12,5,12,5,12,5]
9266 ; AVX512DQ-FCP-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9267 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
9268 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm22, %zmm3
9269 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9270 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
9271 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm22, %zmm3
9272 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9273 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
9274 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm11, %zmm3
9275 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9276 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
9277 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
9278 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9279 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm3
9280 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm30, %zmm3
9281 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9282 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm3
9283 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm30, %zmm3
9284 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9285 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
9286 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm3
9287 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9288 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
9289 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9290 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm20
9291 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm21
9292 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm21
9293 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm19
9294 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
9295 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9296 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
9297 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9298 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm30, %zmm2
9299 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9300 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
9301 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm8, %zmm12
9302 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm28
9303 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm2
9304 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
9305 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
9306 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9307 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm5
9308 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9309 ; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm2
9310 ; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm10
9311 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm8
9312 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm18
9313 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm6
9314 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9315 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
9316 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm6
9317 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9318 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm9, %zmm31
9319 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
9320 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm6
9321 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm15, %zmm4
9322 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm17
9323 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm15
9324 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
9325 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9326 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm11, %zmm4
9327 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9328 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm26
9329 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
9330 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm16
9331 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
9332 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9333 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm4
9334 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9335 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm9, %zmm23
9336 ; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm11
9337 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4
9338 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9339 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm5, %zmm4
9340 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9341 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm27
9342 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm9, %zmm27
9343 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm6
9344 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29
9345 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3
9346 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm22, %zmm29
9347 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm22
9348 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm14
9349 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm22
9350 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm5, %zmm22
9351 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm5
9352 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
9353 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm25
9354 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm25
9355 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm2, %zmm10, %zmm30
9356 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24
9357 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm9, %zmm24
9358 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
9359 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm10, %zmm2, %zmm9
9360 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm7
9361 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm10
9362 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm4
9363 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
9364 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,9,0,5,6,9]
9365 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
9366 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9367 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
9368 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9369 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
9370 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9371 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
9372 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9373 ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5
9374 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm12
9375 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9376 ; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm9
9377 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm8
9378 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9379 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,10,0,5,6,10]
9380 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
9381 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm17
9382 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9383 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9384 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
9385 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9386 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9387 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
9388 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9389 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm6
9390 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9391 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,11,0,5,6,11]
9392 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
9393 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm29
9394 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9395 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
9396 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9397 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9398 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
9399 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9400 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm14
9401 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9402 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,5,6,12,0,5,6,12]
9403 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
9404 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm22
9405 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9406 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
9407 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9408 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9409 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm1
9410 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9411 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm13
9412 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9413 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,13,4,5,6,13]
9414 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
9415 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm25
9416 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9417 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm8
9418 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
9419 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm14
9420 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm30
9421 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,6,14,4,5,6,14]
9422 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
9423 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm24
9424 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9425 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm1
9426 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9427 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm21
9428 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9429 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm7
9430 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9431 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [4,5,8,15,4,5,8,15]
9432 ; AVX512DQ-FCP-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3]
9433 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm20
9434 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9435 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm2, %zmm19
9436 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9437 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm2, %zmm10
9438 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm2, %zmm4
9439 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9440 ; AVX512DQ-FCP-NEXT: movb $24, %al
9441 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
9442 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9443 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9444 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
9445 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9446 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
9447 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
9448 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
9449 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm2
9450 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [7,0,9,0,7,0,9,0]
9451 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
9452 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm20
9453 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm20
9454 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm21 = [0,0,4,11]
9455 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm21, %zmm1
9456 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9457 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9458 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9459 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm16[4,5,4,5]
9460 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9461 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
9462 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm15
9463 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm19
9464 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm19
9465 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm21, %zmm3
9466 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9467 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9468 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9469 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm3[4,5,4,5],zmm11[4,5,4,5]
9470 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9471 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9472 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16
9473 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm16
9474 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm11, %zmm3, %zmm0
9475 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm17
9476 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm4, %zmm17
9477 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm11, %zmm4
9478 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm21, %zmm3
9479 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9480 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
9481 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
9482 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm21, %zmm1
9483 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9484 ; AVX512DQ-FCP-NEXT: vmovdqa 912(%rdi), %xmm11
9485 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
9486 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm16, %zmm1
9487 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9488 ; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm11
9489 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
9490 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm11, %zmm2, %zmm1
9491 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9492 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
9493 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
9494 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm15, %zmm21
9495 ; AVX512DQ-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
9496 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
9497 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm0, %zmm16
9498 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
9499 ; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm7
9500 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm7[0,1,2,3,4,5],ymm0[6,7]
9501 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [4,11]
9502 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
9503 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
9504 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
9505 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm12
9506 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
9507 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm25, %zmm1
9508 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9509 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm11
9510 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
9511 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
9512 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm25
9513 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9514 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15
9515 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
9516 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm15
9517 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
9518 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm8, %zmm1
9519 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9520 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm11
9521 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
9522 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9523 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3,4,5],ymm11[6,7]
9524 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
9525 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
9526 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
9527 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
9528 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm11[4,5,6,7]
9529 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm14, %zmm1
9530 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9531 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm1
9532 ; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm11
9533 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
9534 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9535 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
9536 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9537 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm30, %zmm0
9538 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9539 ; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm0
9540 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
9541 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
9542 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
9543 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
9544 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm7
9545 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
9546 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm30, %zmm1
9547 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9548 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm6
9549 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm6
9550 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm30, %zmm3
9551 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9552 ; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm15
9553 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
9554 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
9555 ; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm12
9556 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
9557 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
9558 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
9559 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm2
9560 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm3
9561 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm28, %zmm8, %zmm0
9562 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm30, %zmm3
9563 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9564 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm5
9565 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9566 ; AVX512DQ-FCP-NEXT: movb $-32, %al
9567 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
9568 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
9569 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9570 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2}
9571 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
9572 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9573 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm30 {%k2}
9574 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm13
9575 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9576 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13 {%k2}
9577 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
9578 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9579 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm18 {%k2}
9580 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9581 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k1}
9582 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9583 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
9584 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9585 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k1}
9586 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9587 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm26 {%k2}
9588 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9589 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k1}
9590 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9591 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm23 {%k2}
9592 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9593 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k1}
9594 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9595 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm27 {%k2}
9596 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
9597 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
9598 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9599 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
9600 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm29 {%k2}
9601 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9602 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm21 {%k2}
9603 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9604 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm16 {%k2}
9605 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm17, %zmm17
9606 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17 {%k2}
9607 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm20, %zmm12
9608 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9609 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12 {%k2}
9610 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm14[4,5,6,7]
9611 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
9612 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
9613 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14
9614 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm14, %zmm19, %zmm9
9615 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9616 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm9 {%k2}
9617 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm24, %zmm1
9618 ; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm14
9619 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, %ymm2
9620 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm14 = ymm2[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
9621 ; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm15
9622 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
9623 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15
9624 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm15, %zmm4, %zmm4
9625 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9626 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm4 {%k2}
9627 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3],ymm14[4,5,6,7]
9628 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9629 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm2, %zmm3
9630 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm14
9631 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
9632 ; AVX512DQ-FCP-NEXT: # ymm14 = mem[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
9633 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm14[4,5,6,7]
9634 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9635 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm6, %zmm2
9636 ; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14
9637 ; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm11 = ymm11[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
9638 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
9639 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9640 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm6, %zmm0
9641 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9642 ; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm11 # 32-byte Folded Reload
9643 ; AVX512DQ-FCP-NEXT: # ymm11 = ymm6[0,1,2,3],mem[4,5,6,7]
9644 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9645 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm6, %zmm11
9646 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9647 ; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm8 # 32-byte Folded Reload
9648 ; AVX512DQ-FCP-NEXT: # ymm8 = ymm6[0,1,2,3],mem[4,5,6,7]
9649 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9650 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm6, %zmm8
9651 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9652 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
9653 ; AVX512DQ-FCP-NEXT: # ymm7 = ymm6[0,1,2,3],mem[4,5,6,7]
9654 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm10, %zmm6
9655 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9656 ; AVX512DQ-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
9657 ; AVX512DQ-FCP-NEXT: # ymm7 = ymm7[0,1,2,3],mem[4,5,6,7]
9658 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
9659 ; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm7, %zmm10, %zmm7
9660 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 192(%rsi)
9661 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rsi)
9662 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 64(%rsi)
9663 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%rsi)
9664 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 192(%rdx)
9665 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%rdx)
9666 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%rdx)
9667 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 128(%rdx)
9668 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 192(%rcx)
9669 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
9670 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx)
9671 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx)
9672 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
9673 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%r8)
9674 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 64(%r8)
9675 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 128(%r8)
9676 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9677 ; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 192(%r9)
9678 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9679 ; AVX512DQ-FCP-NEXT: vmovaps %zmm4, (%r9)
9680 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9681 ; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 64(%r9)
9682 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9683 ; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 128(%r9)
9684 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9685 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
9686 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rax)
9687 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 64(%rax)
9688 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 128(%rax)
9689 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
9690 ; AVX512DQ-FCP-NEXT: vmovaps %zmm7, 128(%rax)
9691 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 192(%rax)
9692 ; AVX512DQ-FCP-NEXT: vmovaps %zmm8, (%rax)
9693 ; AVX512DQ-FCP-NEXT: vmovaps %zmm11, 64(%rax)
9694 ; AVX512DQ-FCP-NEXT: addq $2728, %rsp # imm = 0xAA8
9695 ; AVX512DQ-FCP-NEXT: vzeroupper
9696 ; AVX512DQ-FCP-NEXT: retq
9698 ; AVX512BW-LABEL: load_i64_stride7_vf32:
9699 ; AVX512BW: # %bb.0:
9700 ; AVX512BW-NEXT: subq $2760, %rsp # imm = 0xAC8
9701 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm20
9702 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2
9703 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm29
9704 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4
9705 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm1
9706 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm30
9707 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm28
9708 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm12
9709 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm26
9710 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm17
9711 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6
9712 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9713 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3
9714 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9715 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
9716 ; AVX512BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
9717 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3
9718 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9719 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
9720 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9721 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm6
9722 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
9723 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9724 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm3
9725 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9726 ; AVX512BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm3
9727 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9728 ; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9729 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3
9730 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
9731 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9732 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
9733 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9734 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
9735 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
9736 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
9737 ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9738 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3
9739 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
9740 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9741 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3
9742 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3
9743 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9744 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
9745 ; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9746 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3
9747 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3
9748 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9749 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3
9750 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm3
9751 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9752 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3
9753 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
9754 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9755 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3
9756 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
9757 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9758 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm3
9759 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
9760 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9761 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm3
9762 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm3
9763 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9764 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
9765 ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm3
9766 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9767 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
9768 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9769 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30
9770 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm10
9771 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm10
9772 ; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm17
9773 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9774 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
9775 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9776 ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm1
9777 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9778 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6
9779 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm6
9780 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm24
9781 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm15
9782 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm3
9783 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9784 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm3
9785 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9786 ; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm1
9787 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm16
9788 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8
9789 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm13
9790 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm5
9791 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9792 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
9793 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9794 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm29
9795 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9796 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4
9797 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4
9798 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14
9799 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm18
9800 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm4
9801 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9802 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm4
9803 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9804 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm28
9805 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm19
9806 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4
9807 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9808 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm4
9809 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9810 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
9811 ; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
9812 ; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm12
9813 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm4
9814 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9815 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm4
9816 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9817 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm22
9818 ; AVX512BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm22
9819 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm7
9820 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm29
9821 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm29
9822 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm23
9823 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31
9824 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm31
9825 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm11
9826 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm15
9827 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm15
9828 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm25
9829 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm21
9830 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm21
9831 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4
9832 ; AVX512BW-NEXT: vpermi2q %zmm16, %zmm1, %zmm9
9833 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm16
9834 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4
9835 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0
9836 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
9837 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
9838 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9839 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
9840 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9841 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm3
9842 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9843 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
9844 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9845 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm5
9846 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
9847 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9848 ; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm6
9849 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8
9850 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9851 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
9852 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
9853 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm14
9854 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9855 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9856 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
9857 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9858 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9859 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
9860 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9861 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7
9862 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9863 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
9864 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
9865 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29
9866 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9867 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
9868 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9869 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9870 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
9871 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9872 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm23
9873 ; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9874 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
9875 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
9876 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm31
9877 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9878 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
9879 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9880 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9881 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
9882 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9883 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11
9884 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9885 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13]
9886 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
9887 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm15
9888 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
9889 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
9890 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
9891 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7
9892 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25
9893 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14]
9894 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
9895 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm21
9896 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9897 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8
9898 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9899 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
9900 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9901 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9
9902 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9903 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
9904 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
9905 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm30
9906 ; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9907 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17
9908 ; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9909 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16
9910 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4
9911 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9912 ; AVX512BW-NEXT: movb $24, %al
9913 ; AVX512BW-NEXT: kmovd %eax, %k1
9914 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
9915 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9916 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
9917 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9918 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
9919 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
9920 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
9921 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
9922 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
9923 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
9924 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm27
9925 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27
9926 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11]
9927 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1
9928 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9929 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9930 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9931 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5]
9932 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9933 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm18
9934 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
9935 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm23
9936 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm23
9937 ; AVX512BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm4
9938 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9939 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9940 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
9941 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5]
9942 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9943 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm19
9944 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm19
9945 ; AVX512BW-NEXT: vpermi2q %zmm12, %zmm4, %zmm0
9946 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm20
9947 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm20
9948 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm10
9949 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm4
9950 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9951 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
9952 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
9953 ; AVX512BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1
9954 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9955 ; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm5
9956 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
9957 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1
9958 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9959 ; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm5
9960 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
9961 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1
9962 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9963 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm3
9964 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
9965 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19
9966 ; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm3
9967 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
9968 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18
9969 ; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
9970 ; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm13
9971 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
9972 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
9973 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
9974 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12
9975 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
9976 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm12
9977 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
9978 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
9979 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9980 ; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
9981 ; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1
9982 ; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9983 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
9984 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
9985 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11
9986 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
9987 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11
9988 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
9989 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
9990 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
9991 ; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
9992 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm12
9993 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
9994 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
9995 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm11
9996 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
9997 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
9998 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
9999 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
10000 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10001 ; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
10002 ; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm2
10003 ; AVX512BW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
10004 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
10005 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10006 ; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm3
10007 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
10008 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
10009 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10010 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12]
10011 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11
10012 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11
10013 ; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
10014 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm6
10015 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10016 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3
10017 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm3
10018 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm5
10019 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10020 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm0
10021 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4
10022 ; AVX512BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0
10023 ; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm7
10024 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm30, %zmm2
10025 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
10026 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm9
10027 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10028 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %ymm24
10029 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
10030 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7]
10031 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0
10032 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10033 ; AVX512BW-NEXT: movb $-32, %al
10034 ; AVX512BW-NEXT: kmovd %eax, %k2
10035 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
10036 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10037 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
10038 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
10039 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10040 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
10041 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm2
10042 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10043 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
10044 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm14
10045 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10046 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
10047 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10048 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10049 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
10050 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10051 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
10052 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30
10053 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10054 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
10055 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10056 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2}
10057 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10058 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
10059 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10060 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
10061 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10062 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
10063 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10064 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2}
10065 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
10066 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
10067 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10068 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
10069 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2}
10070 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10071 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
10072 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10073 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
10074 ; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm1
10075 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
10076 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm1
10077 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1
10078 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2}
10079 ; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm6
10080 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
10081 ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6
10082 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6
10083 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10084 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
10085 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm17
10086 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
10087 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17
10088 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15
10089 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10090 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
10091 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %ymm17
10092 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
10093 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17
10094 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21
10095 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10096 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
10097 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %ymm17
10098 ; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload
10099 ; AVX512BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
10100 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7]
10101 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10102 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
10103 ; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm13
10104 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
10105 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
10106 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10107 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
10108 ; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm12
10109 ; AVX512BW-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload
10110 ; AVX512BW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
10111 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7]
10112 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10113 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
10114 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10115 ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
10116 ; AVX512BW-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7]
10117 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10118 ; AVX512BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11
10119 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10120 ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
10121 ; AVX512BW-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7]
10122 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10123 ; AVX512BW-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9
10124 ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
10125 ; AVX512BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
10126 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7
10127 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10128 ; AVX512BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
10129 ; AVX512BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
10130 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
10131 ; AVX512BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8
10132 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 192(%rsi)
10133 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 128(%rsi)
10134 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 64(%rsi)
10135 ; AVX512BW-NEXT: vmovdqa64 %zmm24, (%rsi)
10136 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 192(%rdx)
10137 ; AVX512BW-NEXT: vmovdqa64 %zmm26, (%rdx)
10138 ; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rdx)
10139 ; AVX512BW-NEXT: vmovdqa64 %zmm30, 128(%rdx)
10140 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx)
10141 ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rcx)
10142 ; AVX512BW-NEXT: vmovdqa64 %zmm29, 64(%rcx)
10143 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rcx)
10144 ; AVX512BW-NEXT: vmovdqa64 %zmm21, 192(%r8)
10145 ; AVX512BW-NEXT: vmovdqa64 %zmm15, (%r8)
10146 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 64(%r8)
10147 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 128(%r8)
10148 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10149 ; AVX512BW-NEXT: vmovaps %zmm1, 192(%r9)
10150 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10151 ; AVX512BW-NEXT: vmovaps %zmm1, (%r9)
10152 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10153 ; AVX512BW-NEXT: vmovaps %zmm1, 64(%r9)
10154 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10155 ; AVX512BW-NEXT: vmovaps %zmm1, 128(%r9)
10156 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
10157 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 192(%rax)
10158 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax)
10159 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax)
10160 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10161 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax)
10162 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
10163 ; AVX512BW-NEXT: vmovaps %zmm8, 128(%rax)
10164 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 192(%rax)
10165 ; AVX512BW-NEXT: vmovaps %zmm9, (%rax)
10166 ; AVX512BW-NEXT: vmovaps %zmm11, 64(%rax)
10167 ; AVX512BW-NEXT: addq $2760, %rsp # imm = 0xAC8
10168 ; AVX512BW-NEXT: vzeroupper
10169 ; AVX512BW-NEXT: retq
10171 ; AVX512BW-FCP-LABEL: load_i64_stride7_vf32:
10172 ; AVX512BW-FCP: # %bb.0:
10173 ; AVX512BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8
10174 ; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20
10175 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
10176 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29
10177 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
10178 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1
10179 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30
10180 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
10181 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12
10182 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
10183 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm17
10184 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
10185 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10186 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
10187 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10188 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
10189 ; AVX512BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
10190 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3
10191 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10192 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
10193 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10194 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6
10195 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
10196 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10197 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
10198 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10199 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm3
10200 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10201 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10202 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
10203 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
10204 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10205 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
10206 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10207 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
10208 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
10209 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
10210 ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10211 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
10212 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
10213 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10214 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
10215 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3
10216 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10217 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
10218 ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10219 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
10220 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm3
10221 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10222 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
10223 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm3
10224 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10225 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
10226 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
10227 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10228 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
10229 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
10230 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10231 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
10232 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
10233 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10234 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
10235 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm3
10236 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10237 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
10238 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm3
10239 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10240 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
10241 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10242 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30
10243 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10
10244 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm10
10245 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm17
10246 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10247 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
10248 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10249 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1
10250 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10251 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
10252 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm6
10253 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm24
10254 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm15
10255 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3
10256 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10257 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm3
10258 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10259 ; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
10260 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm16
10261 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8
10262 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13
10263 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5
10264 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10265 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
10266 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10267 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm29
10268 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10269 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
10270 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm4
10271 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
10272 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18
10273 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
10274 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10275 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm4
10276 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10277 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm9, %zmm28
10278 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19
10279 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
10280 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10281 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm4
10282 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10283 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
10284 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
10285 ; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
10286 ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4
10287 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10288 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm4
10289 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10290 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22
10291 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm9, %zmm22
10292 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7
10293 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29
10294 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm29
10295 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm23
10296 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31
10297 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm31
10298 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm11
10299 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15
10300 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm15
10301 ; AVX512BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25
10302 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm21
10303 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm21
10304 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
10305 ; AVX512BW-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm9
10306 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm16
10307 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4
10308 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
10309 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
10310 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10311 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10312 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
10313 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10314 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
10315 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10316 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
10317 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10318 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5
10319 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
10320 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10321 ; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6
10322 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8
10323 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10324 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
10325 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10326 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm14
10327 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10328 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10329 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
10330 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10331 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10332 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
10333 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10334 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7
10335 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10336 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
10337 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10338 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29
10339 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10340 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
10341 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10342 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10343 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
10344 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10345 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm23
10346 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10347 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
10348 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10349 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm31
10350 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10351 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
10352 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10353 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10354 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
10355 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10356 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11
10357 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10358 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13]
10359 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10360 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm15
10361 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10362 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
10363 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10364 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7
10365 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25
10366 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14]
10367 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10368 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm21
10369 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10370 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm8
10371 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10372 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
10373 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10374 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm9
10375 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10376 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
10377 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10378 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm30
10379 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10380 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm17
10381 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10382 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm16
10383 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm4
10384 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10385 ; AVX512BW-FCP-NEXT: movb $24, %al
10386 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
10387 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10388 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10389 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
10390 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10391 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
10392 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
10393 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
10394 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
10395 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
10396 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
10397 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27
10398 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27
10399 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11]
10400 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1
10401 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10402 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10403 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
10404 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5]
10405 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10406 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18
10407 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
10408 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm23
10409 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm23
10410 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm4
10411 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10412 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
10413 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
10414 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5]
10415 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10416 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19
10417 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm19
10418 ; AVX512BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0
10419 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20
10420 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm20
10421 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm10
10422 ; AVX512BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm4
10423 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10424 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
10425 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
10426 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm1
10427 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10428 ; AVX512BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm5
10429 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
10430 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1
10431 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10432 ; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
10433 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
10434 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1
10435 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10436 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
10437 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
10438 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19
10439 ; AVX512BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm3
10440 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
10441 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18
10442 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
10443 ; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13
10444 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
10445 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
10446 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
10447 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12
10448 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10449 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm12
10450 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
10451 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
10452 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10453 ; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
10454 ; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
10455 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10456 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10457 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10458 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
10459 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10460 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm11
10461 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
10462 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
10463 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10464 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
10465 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
10466 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
10467 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
10468 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11
10469 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
10470 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
10471 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
10472 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
10473 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10474 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
10475 ; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm2
10476 ; AVX512BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
10477 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
10478 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10479 ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3
10480 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
10481 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
10482 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10483 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12]
10484 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
10485 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11
10486 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
10487 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6
10488 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10489 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
10490 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3
10491 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm5
10492 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10493 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0
10494 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4
10495 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm0
10496 ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7
10497 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm30, %zmm2
10498 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
10499 ; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm9
10500 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10501 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm24
10502 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
10503 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7]
10504 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0
10505 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10506 ; AVX512BW-FCP-NEXT: movb $-32, %al
10507 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
10508 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
10509 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10510 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
10511 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
10512 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10513 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
10514 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
10515 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10516 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
10517 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14
10518 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10519 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
10520 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10521 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10522 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
10523 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10524 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
10525 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
10526 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10527 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
10528 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10529 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2}
10530 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10531 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
10532 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10533 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
10534 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10535 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
10536 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10537 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2}
10538 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
10539 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
10540 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10541 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
10542 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2}
10543 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10544 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
10545 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10546 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
10547 ; AVX512BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm1
10548 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
10549 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
10550 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1
10551 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2}
10552 ; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm6
10553 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
10554 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
10555 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6
10556 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10557 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
10558 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm17
10559 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
10560 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
10561 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15
10562 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10563 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
10564 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %ymm17
10565 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
10566 ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
10567 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21
10568 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10569 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
10570 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm17
10571 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload
10572 ; AVX512BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
10573 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7]
10574 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10575 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
10576 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
10577 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
10578 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
10579 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10580 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
10581 ; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm12
10582 ; AVX512BW-FCP-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload
10583 ; AVX512BW-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
10584 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7]
10585 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10586 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
10587 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10588 ; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
10589 ; AVX512BW-FCP-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7]
10590 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10591 ; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11
10592 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10593 ; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
10594 ; AVX512BW-FCP-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7]
10595 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10596 ; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9
10597 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
10598 ; AVX512BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
10599 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7
10600 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10601 ; AVX512BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
10602 ; AVX512BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
10603 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
10604 ; AVX512BW-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8
10605 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rsi)
10606 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rsi)
10607 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%rsi)
10608 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, (%rsi)
10609 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rdx)
10610 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, (%rdx)
10611 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rdx)
10612 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, 128(%rdx)
10613 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx)
10614 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
10615 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx)
10616 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx)
10617 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8)
10618 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8)
10619 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
10620 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8)
10621 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10622 ; AVX512BW-FCP-NEXT: vmovaps %zmm1, 192(%r9)
10623 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10624 ; AVX512BW-FCP-NEXT: vmovaps %zmm1, (%r9)
10625 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10626 ; AVX512BW-FCP-NEXT: vmovaps %zmm1, 64(%r9)
10627 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10628 ; AVX512BW-FCP-NEXT: vmovaps %zmm1, 128(%r9)
10629 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
10630 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
10631 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
10632 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
10633 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10634 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
10635 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
10636 ; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rax)
10637 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax)
10638 ; AVX512BW-FCP-NEXT: vmovaps %zmm9, (%rax)
10639 ; AVX512BW-FCP-NEXT: vmovaps %zmm11, 64(%rax)
10640 ; AVX512BW-FCP-NEXT: addq $2760, %rsp # imm = 0xAC8
10641 ; AVX512BW-FCP-NEXT: vzeroupper
10642 ; AVX512BW-FCP-NEXT: retq
10644 ; AVX512DQ-BW-LABEL: load_i64_stride7_vf32:
10645 ; AVX512DQ-BW: # %bb.0:
10646 ; AVX512DQ-BW-NEXT: subq $2760, %rsp # imm = 0xAC8
10647 ; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm20
10648 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2
10649 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm29
10650 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4
10651 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm1
10652 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm30
10653 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm28
10654 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm12
10655 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm26
10656 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm17
10657 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm6
10658 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10659 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm3
10660 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10661 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
10662 ; AVX512DQ-BW-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
10663 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm25, %zmm3
10664 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10665 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
10666 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10667 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm6
10668 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
10669 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10670 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm3
10671 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10672 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm25, %zmm3
10673 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10674 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10675 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3
10676 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
10677 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10678 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
10679 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10680 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
10681 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
10682 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
10683 ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10684 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3
10685 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
10686 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10687 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3
10688 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm7, %zmm3
10689 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10690 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
10691 ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10692 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3
10693 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm23, %zmm3
10694 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10695 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3
10696 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm23, %zmm3
10697 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10698 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3
10699 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
10700 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10701 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3
10702 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
10703 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10704 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm3
10705 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
10706 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10707 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm3
10708 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm25, %zmm3
10709 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10710 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3
10711 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm3
10712 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10713 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
10714 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
10715 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm30
10716 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm10
10717 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm9, %zmm10
10718 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm17
10719 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10720 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1
10721 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10722 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm25, %zmm1
10723 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10724 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6
10725 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm8, %zmm6
10726 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm24
10727 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm15
10728 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm3
10729 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10730 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm25, %zmm3
10731 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10732 ; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm1
10733 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm16
10734 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm8
10735 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm13
10736 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm5
10737 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10738 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
10739 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10740 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm29
10741 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10742 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4
10743 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm7, %zmm4
10744 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm14
10745 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm18
10746 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm4
10747 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10748 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm11, %zmm4
10749 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10750 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm9, %zmm28
10751 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm19
10752 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm4
10753 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10754 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm4
10755 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10756 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
10757 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
10758 ; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm12
10759 ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm4
10760 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10761 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm11, %zmm4
10762 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10763 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm22
10764 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm9, %zmm22
10765 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm7
10766 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm29
10767 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm23, %zmm29
10768 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm23
10769 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm31
10770 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm31
10771 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm11
10772 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm15
10773 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm25, %zmm15
10774 ; AVX512DQ-BW-NEXT: vpermi2q %zmm1, %zmm16, %zmm25
10775 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm21
10776 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm9, %zmm21
10777 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm4
10778 ; AVX512DQ-BW-NEXT: vpermi2q %zmm16, %zmm1, %zmm9
10779 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm0, %zmm16
10780 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm4
10781 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm0
10782 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
10783 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10784 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10785 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
10786 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10787 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm3
10788 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10789 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
10790 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10791 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm5
10792 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
10793 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10794 ; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm6
10795 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm8
10796 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10797 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
10798 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10799 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm14
10800 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10801 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10802 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
10803 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10804 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10805 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
10806 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10807 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm7
10808 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10809 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
10810 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10811 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm29
10812 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10813 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
10814 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10815 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10816 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
10817 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10818 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm23
10819 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10820 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
10821 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10822 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm31
10823 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10824 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
10825 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10826 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10827 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
10828 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10829 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm11
10830 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10831 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13]
10832 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10833 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm15
10834 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10835 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
10836 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
10837 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm7
10838 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm25
10839 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14]
10840 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10841 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm21
10842 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10843 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm8
10844 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10845 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
10846 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10847 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm9
10848 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10849 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
10850 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
10851 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm1, %zmm30
10852 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10853 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17
10854 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10855 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm1, %zmm16
10856 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm1, %zmm4
10857 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10858 ; AVX512DQ-BW-NEXT: movb $24, %al
10859 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
10860 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10861 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10862 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
10863 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10864 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
10865 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
10866 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3
10867 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
10868 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
10869 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
10870 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm27
10871 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm27
10872 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11]
10873 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm5, %zmm1
10874 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10875 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10876 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
10877 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5]
10878 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10879 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm18
10880 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
10881 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm23
10882 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm10, %zmm23
10883 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm5, %zmm4
10884 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10885 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
10886 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
10887 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5]
10888 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10889 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm19
10890 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm0, %zmm19
10891 ; AVX512DQ-BW-NEXT: vpermi2q %zmm12, %zmm4, %zmm0
10892 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm20
10893 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm10, %zmm20
10894 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm12, %zmm10
10895 ; AVX512DQ-BW-NEXT: vpermt2q %zmm12, %zmm5, %zmm4
10896 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10897 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
10898 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
10899 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm5, %zmm1
10900 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10901 ; AVX512DQ-BW-NEXT: vmovdqa 912(%rdi), %xmm5
10902 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
10903 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1
10904 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10905 ; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm5
10906 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
10907 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1
10908 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10909 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm3
10910 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
10911 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19
10912 ; AVX512DQ-BW-NEXT: vmovdqa 1360(%rdi), %xmm3
10913 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
10914 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18
10915 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
10916 ; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm13
10917 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
10918 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
10919 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
10920 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12
10921 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
10922 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm3, %zmm12
10923 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
10924 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
10925 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10926 ; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
10927 ; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm1
10928 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10929 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10930 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
10931 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11
10932 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10933 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm3, %zmm11
10934 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
10935 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
10936 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10937 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
10938 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm12
10939 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
10940 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
10941 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm11
10942 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
10943 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
10944 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
10945 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
10946 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10947 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
10948 ; AVX512DQ-BW-NEXT: vmovdqa 1472(%rdi), %ymm2
10949 ; AVX512DQ-BW-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
10950 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
10951 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
10952 ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm3
10953 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
10954 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
10955 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10956 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12]
10957 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11
10958 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm7, %zmm11
10959 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
10960 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm6
10961 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10962 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3
10963 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm7, %zmm3
10964 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm5
10965 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10966 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm0
10967 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm4
10968 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm7, %zmm0
10969 ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm7
10970 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm30, %zmm2
10971 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
10972 ; AVX512DQ-BW-NEXT: vpermt2q %zmm4, %zmm30, %zmm9
10973 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10974 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %ymm24
10975 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
10976 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7]
10977 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0
10978 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
10979 ; AVX512DQ-BW-NEXT: movb $-32, %al
10980 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
10981 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
10982 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10983 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
10984 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
10985 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10986 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
10987 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm2
10988 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10989 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
10990 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm14
10991 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10992 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
10993 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10994 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
10995 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
10996 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
10997 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
10998 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm30
10999 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11000 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
11001 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11002 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2}
11003 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11004 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
11005 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11006 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
11007 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11008 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
11009 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11010 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2}
11011 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11012 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
11013 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11014 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
11015 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2}
11016 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11017 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
11018 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11019 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
11020 ; AVX512DQ-BW-NEXT: vmovdqa 960(%rdi), %ymm1
11021 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
11022 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm1, %xmm1
11023 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1
11024 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2}
11025 ; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm6
11026 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
11027 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm6
11028 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6
11029 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11030 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
11031 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %ymm17
11032 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
11033 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17
11034 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15
11035 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11036 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
11037 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %ymm17
11038 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
11039 ; AVX512DQ-BW-NEXT: vextracti32x4 $1, %ymm17, %xmm17
11040 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21
11041 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11042 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
11043 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %ymm17
11044 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload
11045 ; AVX512DQ-BW-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
11046 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7]
11047 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11048 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
11049 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm13
11050 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
11051 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
11052 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11053 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
11054 ; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm12
11055 ; AVX512DQ-BW-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload
11056 ; AVX512DQ-BW-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
11057 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7]
11058 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11059 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
11060 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11061 ; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
11062 ; AVX512DQ-BW-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7]
11063 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11064 ; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11
11065 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11066 ; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
11067 ; AVX512DQ-BW-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7]
11068 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11069 ; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9
11070 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
11071 ; AVX512DQ-BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
11072 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7
11073 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11074 ; AVX512DQ-BW-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
11075 ; AVX512DQ-BW-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
11076 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
11077 ; AVX512DQ-BW-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8
11078 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 192(%rsi)
11079 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 128(%rsi)
11080 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 64(%rsi)
11081 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, (%rsi)
11082 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 192(%rdx)
11083 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, (%rdx)
11084 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rdx)
11085 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, 128(%rdx)
11086 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rcx)
11087 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rcx)
11088 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 64(%rcx)
11089 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rcx)
11090 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 192(%r8)
11091 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%r8)
11092 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 64(%r8)
11093 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 128(%r8)
11094 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11095 ; AVX512DQ-BW-NEXT: vmovaps %zmm1, 192(%r9)
11096 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11097 ; AVX512DQ-BW-NEXT: vmovaps %zmm1, (%r9)
11098 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11099 ; AVX512DQ-BW-NEXT: vmovaps %zmm1, 64(%r9)
11100 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11101 ; AVX512DQ-BW-NEXT: vmovaps %zmm1, 128(%r9)
11102 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
11103 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 192(%rax)
11104 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax)
11105 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax)
11106 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11107 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax)
11108 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
11109 ; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%rax)
11110 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 192(%rax)
11111 ; AVX512DQ-BW-NEXT: vmovaps %zmm9, (%rax)
11112 ; AVX512DQ-BW-NEXT: vmovaps %zmm11, 64(%rax)
11113 ; AVX512DQ-BW-NEXT: addq $2760, %rsp # imm = 0xAC8
11114 ; AVX512DQ-BW-NEXT: vzeroupper
11115 ; AVX512DQ-BW-NEXT: retq
11117 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf32:
11118 ; AVX512DQ-BW-FCP: # %bb.0:
11119 ; AVX512DQ-BW-FCP-NEXT: subq $2760, %rsp # imm = 0xAC8
11120 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm20
11121 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
11122 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm29
11123 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
11124 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm1
11125 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm30
11126 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm28
11127 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm12
11128 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm26
11129 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm17
11130 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm6
11131 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11132 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm3
11133 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11134 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm25 = [0,7,14,0,0,7,14,0]
11135 ; AVX512DQ-BW-FCP-NEXT: # zmm25 = mem[0,1,2,3,0,1,2,3]
11136 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm25, %zmm3
11137 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11138 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [10,3,10,3,10,3,10,3]
11139 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11140 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm6
11141 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm6
11142 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11143 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm3
11144 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11145 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm25, %zmm3
11146 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11147 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11148 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
11149 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm8, %zmm3
11150 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11151 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [13,6,13,6,13,6,13,6]
11152 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11153 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [9,0,7,0,9,0,7,0]
11154 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3]
11155 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [11,4,11,4,11,4,11,4]
11156 ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11157 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
11158 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm3
11159 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11160 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
11161 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm7, %zmm3
11162 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11163 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [12,5,12,5,12,5,12,5]
11164 ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11165 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
11166 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm23, %zmm3
11167 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11168 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
11169 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm3
11170 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11171 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
11172 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm11, %zmm3
11173 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11174 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
11175 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm3
11176 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11177 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm3
11178 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm25, %zmm3
11179 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11180 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm3
11181 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm25, %zmm3
11182 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11183 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
11184 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm3
11185 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11186 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,9,2,9,2,9,2,9]
11187 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
11188 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm30
11189 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm10
11190 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm9, %zmm10
11191 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm17
11192 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11193 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
11194 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11195 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm25, %zmm1
11196 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11197 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
11198 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm6
11199 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm24
11200 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm15
11201 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm3
11202 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11203 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm25, %zmm3
11204 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11205 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm1
11206 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm16
11207 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm8
11208 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm13
11209 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm5
11210 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11211 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
11212 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11213 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm9, %zmm29
11214 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11215 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
11216 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm7, %zmm4
11217 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm14
11218 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm18
11219 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm4
11220 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11221 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm4
11222 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11223 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm9, %zmm28
11224 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm19
11225 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
11226 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11227 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm4
11228 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11229 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
11230 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm9, %zmm26 # 64-byte Folded Reload
11231 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
11232 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm4
11233 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11234 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm11, %zmm4
11235 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11236 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm22
11237 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm9, %zmm22
11238 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm7
11239 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm29
11240 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm29
11241 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm23
11242 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm31
11243 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm31
11244 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm11
11245 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm15
11246 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm25, %zmm15
11247 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm1, %zmm16, %zmm25
11248 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm21
11249 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm9, %zmm21
11250 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm4
11251 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm16, %zmm1, %zmm9
11252 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm0, %zmm16
11253 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm4
11254 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm0
11255 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
11256 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
11257 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11258 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
11259 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11260 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm3
11261 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11262 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
11263 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11264 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm5
11265 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm6
11266 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11267 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm6
11268 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm8
11269 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11270 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
11271 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
11272 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm14
11273 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11274 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11275 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
11276 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11277 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11278 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
11279 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11280 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm7
11281 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11282 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
11283 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
11284 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm29
11285 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11286 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
11287 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11288 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11289 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
11290 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11291 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm23
11292 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11293 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
11294 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
11295 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm31
11296 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11297 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
11298 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11299 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11300 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm2
11301 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11302 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm11
11303 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11304 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,13,4,5,6,13]
11305 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
11306 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm15
11307 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11308 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm2
11309 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11310 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm7
11311 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm25
11312 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,6,14,4,5,6,14]
11313 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
11314 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm21
11315 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11316 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm8
11317 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11318 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
11319 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11320 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm9
11321 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11322 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
11323 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
11324 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm30
11325 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11326 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm1, %zmm17
11327 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11328 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm16
11329 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm1, %zmm4
11330 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11331 ; AVX512DQ-BW-FCP-NEXT: movb $24, %al
11332 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
11333 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11334 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11335 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[4,5,4,5],zmm18[4,5,4,5]
11336 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11337 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
11338 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
11339 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
11340 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm0, %zmm3
11341 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [7,0,9,0,7,0,9,0]
11342 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
11343 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm27
11344 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm27
11345 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm5 = [0,0,4,11]
11346 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm5, %zmm1
11347 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11348 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11349 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
11350 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm4[4,5,4,5],zmm19[4,5,4,5]
11351 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11352 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm18
11353 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm18
11354 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm23
11355 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm10, %zmm23
11356 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm5, %zmm4
11357 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11358 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
11359 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
11360 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm4[4,5,4,5],zmm12[4,5,4,5]
11361 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11362 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm19
11363 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm0, %zmm19
11364 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm12, %zmm4, %zmm0
11365 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm20
11366 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm10, %zmm20
11367 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm12, %zmm10
11368 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm12, %zmm5, %zmm4
11369 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11370 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
11371 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm1[4,5,4,5],zmm13[4,5,4,5]
11372 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm5, %zmm1
11373 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11374 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm5
11375 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
11376 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm19, %zmm1
11377 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11378 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm5
11379 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],mem[2,3]
11380 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm3, %zmm1
11381 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11382 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
11383 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
11384 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm18, %zmm19
11385 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm3
11386 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
11387 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm0, %zmm18
11388 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
11389 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm13
11390 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
11391 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm3 = [4,11]
11392 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
11393 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12
11394 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11395 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm3, %zmm12
11396 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
11397 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm15, %zmm0
11398 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11399 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
11400 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
11401 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11402 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
11403 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
11404 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
11405 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11406 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm11
11407 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
11408 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm2, %zmm0
11409 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11410 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
11411 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm12
11412 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
11413 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11414 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm11
11415 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
11416 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm11
11417 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
11418 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
11419 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11420 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
11421 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm2
11422 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm2, (%rsp) # 32-byte Spill
11423 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
11424 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
11425 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm3
11426 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
11427 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm25, %zmm0
11428 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11429 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm7 = [5,12]
11430 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
11431 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm7, %zmm11
11432 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm30 = [6,13]
11433 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm6
11434 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11435 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
11436 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm7, %zmm3
11437 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm5
11438 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11439 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm0
11440 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm4
11441 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm7, %zmm0
11442 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm7
11443 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm30, %zmm2
11444 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
11445 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm30, %zmm9
11446 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11447 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %ymm24
11448 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = ymm13[8,9,10,11,12,13,14,15],ymm24[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm24[16,17,18,19,20,21,22,23]
11449 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm5[4,5,6,7]
11450 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm21, %zmm0
11451 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
11452 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %al
11453 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
11454 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
11455 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11456 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
11457 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
11458 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11459 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
11460 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm2
11461 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11462 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
11463 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm14
11464 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11465 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
11466 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11467 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11468 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
11469 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11470 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
11471 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm30
11472 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11473 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
11474 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11475 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k2}
11476 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11477 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k1}
11478 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11479 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
11480 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11481 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
11482 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11483 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k2}
11484 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
11485 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm5 {%k2}
11486 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11487 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
11488 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm29 {%k2}
11489 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11490 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
11491 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11492 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
11493 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm1
11494 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
11495 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
11496 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm1
11497 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1 {%k2}
11498 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm6
11499 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
11500 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
11501 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm27, %zmm6
11502 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11503 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm6 {%k2}
11504 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %ymm17
11505 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
11506 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
11507 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm23, %zmm15
11508 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11509 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
11510 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %ymm17
11511 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm17 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
11512 ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17
11513 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm17, %zmm10, %zmm21
11514 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11515 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm21 {%k2}
11516 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %ymm17
11517 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17, %ymm13 # 32-byte Folded Reload
11518 ; AVX512DQ-BW-FCP-NEXT: # ymm13 = mem[8,9,10,11,12,13,14,15],ymm17[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm17[16,17,18,19,20,21,22,23]
11519 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm13[4,5,6,7]
11520 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11521 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm0, %zmm4
11522 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm13
11523 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm12 = ymm12[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
11524 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm12[4,5,6,7]
11525 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11526 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm0, %zmm3
11527 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm12
11528 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, (%rsp), %ymm12, %ymm11 # 32-byte Folded Reload
11529 ; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
11530 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm11[4,5,6,7]
11531 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11532 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm7, %zmm0
11533 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11534 ; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm11 # 32-byte Folded Reload
11535 ; AVX512DQ-BW-FCP-NEXT: # ymm11 = ymm7[0,1,2,3],mem[4,5,6,7]
11536 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11537 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm7, %zmm11
11538 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11539 ; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm9 # 32-byte Folded Reload
11540 ; AVX512DQ-BW-FCP-NEXT: # ymm9 = ymm7[0,1,2,3],mem[4,5,6,7]
11541 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
11542 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm9, %zmm7, %zmm9
11543 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
11544 ; AVX512DQ-BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
11545 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm16, %zmm7
11546 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
11547 ; AVX512DQ-BW-FCP-NEXT: vblendps $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm8 # 32-byte Folded Reload
11548 ; AVX512DQ-BW-FCP-NEXT: # ymm8 = ymm8[0,1,2,3],mem[4,5,6,7]
11549 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
11550 ; AVX512DQ-BW-FCP-NEXT: vinsertf64x4 $0, %ymm8, %zmm10, %zmm8
11551 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 192(%rsi)
11552 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rsi)
11553 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 64(%rsi)
11554 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, (%rsi)
11555 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rdx)
11556 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, (%rdx)
11557 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rdx)
11558 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, 128(%rdx)
11559 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx)
11560 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
11561 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 64(%rcx)
11562 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%rcx)
11563 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 192(%r8)
11564 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%r8)
11565 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
11566 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 128(%r8)
11567 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11568 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 192(%r9)
11569 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11570 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, (%r9)
11571 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11572 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 64(%r9)
11573 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
11574 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm1, 128(%r9)
11575 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11576 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax)
11577 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax)
11578 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
11579 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
11580 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
11581 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11582 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rax)
11583 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 192(%rax)
11584 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm9, (%rax)
11585 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm11, 64(%rax)
11586 ; AVX512DQ-BW-FCP-NEXT: addq $2760, %rsp # imm = 0xAC8
11587 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
11588 ; AVX512DQ-BW-FCP-NEXT: retq
11589 %wide.vec = load <224 x i64>, ptr %in.vec, align 64
11590 %strided.vec0 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217>
11591 %strided.vec1 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218>
11592 %strided.vec2 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219>
11593 %strided.vec3 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220>
11594 %strided.vec4 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221>
11595 %strided.vec5 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222>
11596 %strided.vec6 = shufflevector <224 x i64> %wide.vec, <224 x i64> poison, <32 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223>
11597 store <32 x i64> %strided.vec0, ptr %out.vec0, align 64
11598 store <32 x i64> %strided.vec1, ptr %out.vec1, align 64
11599 store <32 x i64> %strided.vec2, ptr %out.vec2, align 64
11600 store <32 x i64> %strided.vec3, ptr %out.vec3, align 64
11601 store <32 x i64> %strided.vec4, ptr %out.vec4, align 64
11602 store <32 x i64> %strided.vec5, ptr %out.vec5, align 64
11603 store <32 x i64> %strided.vec6, ptr %out.vec6, align 64
11604 ret void
11605 }
11607 define void @load_i64_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6) nounwind {
11608 ; SSE-LABEL: load_i64_stride7_vf64:
11609 ; SSE: # %bb.0:
11610 ; SSE-NEXT: subq $3240, %rsp # imm = 0xCA8
11611 ; SSE-NEXT: movapd 208(%rdi), %xmm3
11612 ; SSE-NEXT: movapd 96(%rdi), %xmm2
11613 ; SSE-NEXT: movapd 144(%rdi), %xmm4
11614 ; SSE-NEXT: movapd 192(%rdi), %xmm6
11615 ; SSE-NEXT: movapd 80(%rdi), %xmm5
11616 ; SSE-NEXT: movapd 128(%rdi), %xmm8
11617 ; SSE-NEXT: movapd 176(%rdi), %xmm11
11618 ; SSE-NEXT: movapd 64(%rdi), %xmm10
11619 ; SSE-NEXT: movapd (%rdi), %xmm12
11620 ; SSE-NEXT: movapd 16(%rdi), %xmm9
11621 ; SSE-NEXT: movapd 32(%rdi), %xmm7
11622 ; SSE-NEXT: movapd 48(%rdi), %xmm0
11623 ; SSE-NEXT: movapd 224(%rdi), %xmm13
11624 ; SSE-NEXT: movapd 112(%rdi), %xmm14
11625 ; SSE-NEXT: movapd 160(%rdi), %xmm1
11626 ; SSE-NEXT: movapd %xmm0, %xmm15
11627 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm12[0],xmm15[1]
11628 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11629 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm10[0]
11630 ; SSE-NEXT: movapd %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11631 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm9[0],xmm10[1]
11632 ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11633 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm5[0]
11634 ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11635 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm7[0],xmm5[1]
11636 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11637 ; SSE-NEXT: shufpd {{.*#+}} xmm7 = xmm7[1],xmm2[0]
11638 ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11639 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
11640 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11641 ; SSE-NEXT: movapd %xmm1, %xmm0
11642 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm14[0],xmm0[1]
11643 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11644 ; SSE-NEXT: shufpd {{.*#+}} xmm14 = xmm14[1],xmm11[0]
11645 ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11646 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm8[0],xmm11[1]
11647 ; SSE-NEXT: movapd %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11648 ; SSE-NEXT: shufpd {{.*#+}} xmm8 = xmm8[1],xmm6[0]
11649 ; SSE-NEXT: movapd %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11650 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm4[0],xmm6[1]
11651 ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11652 ; SSE-NEXT: shufpd {{.*#+}} xmm4 = xmm4[1],xmm3[0]
11653 ; SSE-NEXT: movapd %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11654 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm1[0],xmm3[1]
11655 ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11656 ; SSE-NEXT: movapd 272(%rdi), %xmm0
11657 ; SSE-NEXT: movapd %xmm0, %xmm1
11658 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm13[0],xmm1[1]
11659 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11660 ; SSE-NEXT: movapd 288(%rdi), %xmm1
11661 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm1[0]
11662 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11663 ; SSE-NEXT: movapd 240(%rdi), %xmm2
11664 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11665 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11666 ; SSE-NEXT: movapd 304(%rdi), %xmm1
11667 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11668 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11669 ; SSE-NEXT: movapd 256(%rdi), %xmm2
11670 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11671 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11672 ; SSE-NEXT: movapd 320(%rdi), %xmm1
11673 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11674 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11675 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11676 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11677 ; SSE-NEXT: movapd 336(%rdi), %xmm2
11678 ; SSE-NEXT: movapd 384(%rdi), %xmm0
11679 ; SSE-NEXT: movapd %xmm0, %xmm1
11680 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11681 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11682 ; SSE-NEXT: movapd 400(%rdi), %xmm1
11683 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11684 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11685 ; SSE-NEXT: movapd 352(%rdi), %xmm2
11686 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11687 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11688 ; SSE-NEXT: movapd 416(%rdi), %xmm1
11689 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11690 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11691 ; SSE-NEXT: movapd 368(%rdi), %xmm2
11692 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11693 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11694 ; SSE-NEXT: movapd 432(%rdi), %xmm1
11695 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11696 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11697 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11698 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11699 ; SSE-NEXT: movapd 448(%rdi), %xmm2
11700 ; SSE-NEXT: movapd 496(%rdi), %xmm0
11701 ; SSE-NEXT: movapd %xmm0, %xmm1
11702 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11703 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11704 ; SSE-NEXT: movapd 512(%rdi), %xmm1
11705 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11706 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11707 ; SSE-NEXT: movapd 464(%rdi), %xmm2
11708 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11709 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11710 ; SSE-NEXT: movapd 528(%rdi), %xmm1
11711 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11712 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11713 ; SSE-NEXT: movapd 480(%rdi), %xmm2
11714 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11715 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11716 ; SSE-NEXT: movapd 544(%rdi), %xmm1
11717 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11718 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11719 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11720 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11721 ; SSE-NEXT: movapd 560(%rdi), %xmm2
11722 ; SSE-NEXT: movapd 608(%rdi), %xmm0
11723 ; SSE-NEXT: movapd %xmm0, %xmm1
11724 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11725 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11726 ; SSE-NEXT: movapd 624(%rdi), %xmm1
11727 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11728 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11729 ; SSE-NEXT: movapd 576(%rdi), %xmm2
11730 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11731 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11732 ; SSE-NEXT: movapd 640(%rdi), %xmm1
11733 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11734 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11735 ; SSE-NEXT: movapd 592(%rdi), %xmm2
11736 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11737 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11738 ; SSE-NEXT: movapd 656(%rdi), %xmm1
11739 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11740 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11741 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11742 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11743 ; SSE-NEXT: movapd 672(%rdi), %xmm2
11744 ; SSE-NEXT: movapd 720(%rdi), %xmm0
11745 ; SSE-NEXT: movapd %xmm0, %xmm1
11746 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11747 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11748 ; SSE-NEXT: movapd 736(%rdi), %xmm1
11749 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11750 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11751 ; SSE-NEXT: movapd 688(%rdi), %xmm2
11752 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11753 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11754 ; SSE-NEXT: movapd 752(%rdi), %xmm1
11755 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11756 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11757 ; SSE-NEXT: movapd 704(%rdi), %xmm2
11758 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11759 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11760 ; SSE-NEXT: movapd 768(%rdi), %xmm1
11761 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11762 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11763 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11764 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11765 ; SSE-NEXT: movapd 784(%rdi), %xmm2
11766 ; SSE-NEXT: movapd 832(%rdi), %xmm0
11767 ; SSE-NEXT: movapd %xmm0, %xmm1
11768 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11769 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11770 ; SSE-NEXT: movapd 848(%rdi), %xmm1
11771 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11772 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11773 ; SSE-NEXT: movapd 800(%rdi), %xmm2
11774 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11775 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11776 ; SSE-NEXT: movapd 864(%rdi), %xmm1
11777 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11778 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11779 ; SSE-NEXT: movapd 816(%rdi), %xmm2
11780 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11781 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11782 ; SSE-NEXT: movapd 880(%rdi), %xmm1
11783 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11784 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11785 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11786 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11787 ; SSE-NEXT: movapd 896(%rdi), %xmm2
11788 ; SSE-NEXT: movapd 944(%rdi), %xmm0
11789 ; SSE-NEXT: movapd %xmm0, %xmm1
11790 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11791 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11792 ; SSE-NEXT: movapd 960(%rdi), %xmm1
11793 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11794 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11795 ; SSE-NEXT: movapd 912(%rdi), %xmm2
11796 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11797 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11798 ; SSE-NEXT: movapd 976(%rdi), %xmm1
11799 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11800 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11801 ; SSE-NEXT: movapd 928(%rdi), %xmm2
11802 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11803 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11804 ; SSE-NEXT: movapd 992(%rdi), %xmm1
11805 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11806 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11807 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11808 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11809 ; SSE-NEXT: movapd 1008(%rdi), %xmm2
11810 ; SSE-NEXT: movapd 1056(%rdi), %xmm0
11811 ; SSE-NEXT: movapd %xmm0, %xmm1
11812 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11813 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11814 ; SSE-NEXT: movapd 1072(%rdi), %xmm1
11815 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11816 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11817 ; SSE-NEXT: movapd 1024(%rdi), %xmm2
11818 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11819 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11820 ; SSE-NEXT: movapd 1088(%rdi), %xmm1
11821 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11822 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11823 ; SSE-NEXT: movapd 1040(%rdi), %xmm2
11824 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11825 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11826 ; SSE-NEXT: movapd 1104(%rdi), %xmm1
11827 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11828 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11829 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11830 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11831 ; SSE-NEXT: movapd 1120(%rdi), %xmm2
11832 ; SSE-NEXT: movapd 1168(%rdi), %xmm0
11833 ; SSE-NEXT: movapd %xmm0, %xmm1
11834 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11835 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11836 ; SSE-NEXT: movapd 1184(%rdi), %xmm1
11837 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11838 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11839 ; SSE-NEXT: movapd 1136(%rdi), %xmm2
11840 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11841 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11842 ; SSE-NEXT: movapd 1200(%rdi), %xmm1
11843 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11844 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11845 ; SSE-NEXT: movapd 1152(%rdi), %xmm2
11846 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11847 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11848 ; SSE-NEXT: movapd 1216(%rdi), %xmm1
11849 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11850 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11851 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11852 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11853 ; SSE-NEXT: movapd 1232(%rdi), %xmm2
11854 ; SSE-NEXT: movapd 1280(%rdi), %xmm0
11855 ; SSE-NEXT: movapd %xmm0, %xmm1
11856 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11857 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11858 ; SSE-NEXT: movapd 1296(%rdi), %xmm1
11859 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11860 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11861 ; SSE-NEXT: movapd 1248(%rdi), %xmm2
11862 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11863 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11864 ; SSE-NEXT: movapd 1312(%rdi), %xmm1
11865 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11866 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11867 ; SSE-NEXT: movapd 1264(%rdi), %xmm2
11868 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11869 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11870 ; SSE-NEXT: movapd 1328(%rdi), %xmm1
11871 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11872 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11873 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11874 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11875 ; SSE-NEXT: movapd 1344(%rdi), %xmm2
11876 ; SSE-NEXT: movapd 1392(%rdi), %xmm0
11877 ; SSE-NEXT: movapd %xmm0, %xmm1
11878 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11879 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11880 ; SSE-NEXT: movapd 1408(%rdi), %xmm1
11881 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11882 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11883 ; SSE-NEXT: movapd 1360(%rdi), %xmm2
11884 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11885 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11886 ; SSE-NEXT: movapd 1424(%rdi), %xmm1
11887 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11888 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11889 ; SSE-NEXT: movapd 1376(%rdi), %xmm2
11890 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11891 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11892 ; SSE-NEXT: movapd 1440(%rdi), %xmm1
11893 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11894 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11895 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11896 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11897 ; SSE-NEXT: movapd 1456(%rdi), %xmm2
11898 ; SSE-NEXT: movapd 1504(%rdi), %xmm0
11899 ; SSE-NEXT: movapd %xmm0, %xmm1
11900 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11901 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11902 ; SSE-NEXT: movapd 1520(%rdi), %xmm1
11903 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11904 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11905 ; SSE-NEXT: movapd 1472(%rdi), %xmm2
11906 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11907 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11908 ; SSE-NEXT: movapd 1536(%rdi), %xmm1
11909 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11910 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11911 ; SSE-NEXT: movapd 1488(%rdi), %xmm2
11912 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11913 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11914 ; SSE-NEXT: movapd 1552(%rdi), %xmm1
11915 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11916 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11917 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11918 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11919 ; SSE-NEXT: movapd 1568(%rdi), %xmm2
11920 ; SSE-NEXT: movapd 1616(%rdi), %xmm0
11921 ; SSE-NEXT: movapd %xmm0, %xmm1
11922 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11923 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11924 ; SSE-NEXT: movapd 1632(%rdi), %xmm1
11925 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11926 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11927 ; SSE-NEXT: movapd 1584(%rdi), %xmm2
11928 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11929 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11930 ; SSE-NEXT: movapd 1648(%rdi), %xmm1
11931 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11932 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11933 ; SSE-NEXT: movapd 1600(%rdi), %xmm2
11934 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11935 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11936 ; SSE-NEXT: movapd 1664(%rdi), %xmm1
11937 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11938 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11939 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11940 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11941 ; SSE-NEXT: movapd 1680(%rdi), %xmm2
11942 ; SSE-NEXT: movapd 1728(%rdi), %xmm0
11943 ; SSE-NEXT: movapd %xmm0, %xmm1
11944 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11945 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11946 ; SSE-NEXT: movapd 1744(%rdi), %xmm1
11947 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11948 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11949 ; SSE-NEXT: movapd 1696(%rdi), %xmm2
11950 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11951 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11952 ; SSE-NEXT: movapd 1760(%rdi), %xmm1
11953 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11954 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11955 ; SSE-NEXT: movapd 1712(%rdi), %xmm2
11956 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11957 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11958 ; SSE-NEXT: movapd 1776(%rdi), %xmm1
11959 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11960 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11961 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11962 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11963 ; SSE-NEXT: movapd 1792(%rdi), %xmm2
11964 ; SSE-NEXT: movapd 1840(%rdi), %xmm0
11965 ; SSE-NEXT: movapd %xmm0, %xmm1
11966 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11967 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11968 ; SSE-NEXT: movapd 1856(%rdi), %xmm1
11969 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11970 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11971 ; SSE-NEXT: movapd 1808(%rdi), %xmm2
11972 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11973 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11974 ; SSE-NEXT: movapd 1872(%rdi), %xmm1
11975 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11976 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11977 ; SSE-NEXT: movapd 1824(%rdi), %xmm2
11978 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11979 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11980 ; SSE-NEXT: movapd 1888(%rdi), %xmm1
11981 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11982 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11983 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
11984 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11985 ; SSE-NEXT: movapd 1904(%rdi), %xmm2
11986 ; SSE-NEXT: movapd 1952(%rdi), %xmm0
11987 ; SSE-NEXT: movapd %xmm0, %xmm1
11988 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11989 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11990 ; SSE-NEXT: movapd 1968(%rdi), %xmm1
11991 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11992 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11993 ; SSE-NEXT: movapd 1920(%rdi), %xmm2
11994 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
11995 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11996 ; SSE-NEXT: movapd 1984(%rdi), %xmm1
11997 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
11998 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11999 ; SSE-NEXT: movapd 1936(%rdi), %xmm2
12000 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12001 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12002 ; SSE-NEXT: movapd 2000(%rdi), %xmm1
12003 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12004 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12005 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12006 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12007 ; SSE-NEXT: movapd 2016(%rdi), %xmm2
12008 ; SSE-NEXT: movapd 2064(%rdi), %xmm0
12009 ; SSE-NEXT: movapd %xmm0, %xmm1
12010 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12011 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12012 ; SSE-NEXT: movapd 2080(%rdi), %xmm1
12013 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12014 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12015 ; SSE-NEXT: movapd 2032(%rdi), %xmm2
12016 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12017 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12018 ; SSE-NEXT: movapd 2096(%rdi), %xmm1
12019 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12020 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12021 ; SSE-NEXT: movapd 2048(%rdi), %xmm2
12022 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12023 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12024 ; SSE-NEXT: movapd 2112(%rdi), %xmm1
12025 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12026 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12027 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12028 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12029 ; SSE-NEXT: movapd 2128(%rdi), %xmm2
12030 ; SSE-NEXT: movapd 2176(%rdi), %xmm0
12031 ; SSE-NEXT: movapd %xmm0, %xmm1
12032 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12033 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12034 ; SSE-NEXT: movapd 2192(%rdi), %xmm1
12035 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12036 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12037 ; SSE-NEXT: movapd 2144(%rdi), %xmm2
12038 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12039 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12040 ; SSE-NEXT: movapd 2208(%rdi), %xmm1
12041 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12042 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12043 ; SSE-NEXT: movapd 2160(%rdi), %xmm2
12044 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12045 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12046 ; SSE-NEXT: movapd 2224(%rdi), %xmm1
12047 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12048 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12049 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12050 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12051 ; SSE-NEXT: movapd 2240(%rdi), %xmm2
12052 ; SSE-NEXT: movapd 2288(%rdi), %xmm0
12053 ; SSE-NEXT: movapd %xmm0, %xmm1
12054 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12055 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12056 ; SSE-NEXT: movapd 2304(%rdi), %xmm1
12057 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12058 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12059 ; SSE-NEXT: movapd 2256(%rdi), %xmm2
12060 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12061 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12062 ; SSE-NEXT: movapd 2320(%rdi), %xmm1
12063 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12064 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12065 ; SSE-NEXT: movapd 2272(%rdi), %xmm2
12066 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12067 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12068 ; SSE-NEXT: movapd 2336(%rdi), %xmm1
12069 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12070 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12071 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12072 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12073 ; SSE-NEXT: movapd 2352(%rdi), %xmm2
12074 ; SSE-NEXT: movapd 2400(%rdi), %xmm0
12075 ; SSE-NEXT: movapd %xmm0, %xmm1
12076 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12077 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12078 ; SSE-NEXT: movapd 2416(%rdi), %xmm1
12079 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12080 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12081 ; SSE-NEXT: movapd 2368(%rdi), %xmm2
12082 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12083 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12084 ; SSE-NEXT: movapd 2432(%rdi), %xmm1
12085 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12086 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12087 ; SSE-NEXT: movapd 2384(%rdi), %xmm2
12088 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12089 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12090 ; SSE-NEXT: movapd 2448(%rdi), %xmm1
12091 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12092 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12093 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12094 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12095 ; SSE-NEXT: movapd 2464(%rdi), %xmm2
12096 ; SSE-NEXT: movapd 2512(%rdi), %xmm0
12097 ; SSE-NEXT: movapd %xmm0, %xmm1
12098 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12099 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12100 ; SSE-NEXT: movapd 2528(%rdi), %xmm1
12101 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12102 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12103 ; SSE-NEXT: movapd 2480(%rdi), %xmm2
12104 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12105 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12106 ; SSE-NEXT: movapd 2544(%rdi), %xmm1
12107 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12108 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12109 ; SSE-NEXT: movapd 2496(%rdi), %xmm2
12110 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12111 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12112 ; SSE-NEXT: movapd 2560(%rdi), %xmm1
12113 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12114 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12115 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12116 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12117 ; SSE-NEXT: movapd 2576(%rdi), %xmm2
12118 ; SSE-NEXT: movapd 2624(%rdi), %xmm0
12119 ; SSE-NEXT: movapd %xmm0, %xmm1
12120 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12121 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12122 ; SSE-NEXT: movapd 2640(%rdi), %xmm1
12123 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12124 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12125 ; SSE-NEXT: movapd 2592(%rdi), %xmm2
12126 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12127 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12128 ; SSE-NEXT: movapd 2656(%rdi), %xmm1
12129 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12130 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12131 ; SSE-NEXT: movapd 2608(%rdi), %xmm2
12132 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12133 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12134 ; SSE-NEXT: movapd 2672(%rdi), %xmm1
12135 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12136 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12137 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12138 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12139 ; SSE-NEXT: movapd 2688(%rdi), %xmm2
12140 ; SSE-NEXT: movapd 2736(%rdi), %xmm0
12141 ; SSE-NEXT: movapd %xmm0, %xmm1
12142 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12143 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12144 ; SSE-NEXT: movapd 2752(%rdi), %xmm1
12145 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12146 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12147 ; SSE-NEXT: movapd 2704(%rdi), %xmm2
12148 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12149 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12150 ; SSE-NEXT: movapd 2768(%rdi), %xmm1
12151 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12152 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12153 ; SSE-NEXT: movapd 2720(%rdi), %xmm2
12154 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12155 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12156 ; SSE-NEXT: movapd 2784(%rdi), %xmm1
12157 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12158 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12159 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12160 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12161 ; SSE-NEXT: movapd 2800(%rdi), %xmm2
12162 ; SSE-NEXT: movapd 2848(%rdi), %xmm0
12163 ; SSE-NEXT: movapd %xmm0, %xmm1
12164 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12165 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12166 ; SSE-NEXT: movapd 2864(%rdi), %xmm1
12167 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12168 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12169 ; SSE-NEXT: movapd 2816(%rdi), %xmm2
12170 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12171 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12172 ; SSE-NEXT: movapd 2880(%rdi), %xmm1
12173 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12174 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12175 ; SSE-NEXT: movapd 2832(%rdi), %xmm2
12176 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12177 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12178 ; SSE-NEXT: movapd 2896(%rdi), %xmm1
12179 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12180 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12181 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12182 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12183 ; SSE-NEXT: movapd 2912(%rdi), %xmm2
12184 ; SSE-NEXT: movapd 2960(%rdi), %xmm0
12185 ; SSE-NEXT: movapd %xmm0, %xmm1
12186 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12187 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12188 ; SSE-NEXT: movapd 2976(%rdi), %xmm1
12189 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12190 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12191 ; SSE-NEXT: movapd 2928(%rdi), %xmm2
12192 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12193 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12194 ; SSE-NEXT: movapd 2992(%rdi), %xmm1
12195 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12196 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12197 ; SSE-NEXT: movapd 2944(%rdi), %xmm2
12198 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12199 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12200 ; SSE-NEXT: movapd 3008(%rdi), %xmm1
12201 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12202 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12203 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12204 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12205 ; SSE-NEXT: movapd 3024(%rdi), %xmm2
12206 ; SSE-NEXT: movapd 3072(%rdi), %xmm0
12207 ; SSE-NEXT: movapd %xmm0, %xmm14
12208 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1]
12209 ; SSE-NEXT: movapd 3088(%rdi), %xmm1
12210 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12211 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12212 ; SSE-NEXT: movapd 3040(%rdi), %xmm2
12213 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12214 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12215 ; SSE-NEXT: movapd 3104(%rdi), %xmm1
12216 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12217 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12218 ; SSE-NEXT: movapd 3056(%rdi), %xmm2
12219 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12220 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12221 ; SSE-NEXT: movapd 3120(%rdi), %xmm1
12222 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12223 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12224 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12225 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12226 ; SSE-NEXT: movapd 3136(%rdi), %xmm12
12227 ; SSE-NEXT: movapd 3184(%rdi), %xmm0
12228 ; SSE-NEXT: movapd %xmm0, %xmm10
12229 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm12[0],xmm10[1]
12230 ; SSE-NEXT: movapd 3200(%rdi), %xmm1
12231 ; SSE-NEXT: shufpd {{.*#+}} xmm12 = xmm12[1],xmm1[0]
12232 ; SSE-NEXT: movapd 3152(%rdi), %xmm2
12233 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12234 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12235 ; SSE-NEXT: movapd 3216(%rdi), %xmm1
12236 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12237 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12238 ; SSE-NEXT: movapd 3168(%rdi), %xmm2
12239 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12240 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12241 ; SSE-NEXT: movapd 3232(%rdi), %xmm1
12242 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12243 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12244 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12245 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12246 ; SSE-NEXT: movapd 3248(%rdi), %xmm9
12247 ; SSE-NEXT: movapd 3296(%rdi), %xmm0
12248 ; SSE-NEXT: movapd %xmm0, %xmm7
12249 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm9[0],xmm7[1]
12250 ; SSE-NEXT: movapd 3312(%rdi), %xmm15
12251 ; SSE-NEXT: shufpd {{.*#+}} xmm9 = xmm9[1],xmm15[0]
12252 ; SSE-NEXT: movapd 3264(%rdi), %xmm2
12253 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm2[0],xmm15[1]
12254 ; SSE-NEXT: movapd 3328(%rdi), %xmm1
12255 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12256 ; SSE-NEXT: movapd %xmm2, (%rsp) # 16-byte Spill
12257 ; SSE-NEXT: movapd 3280(%rdi), %xmm2
12258 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12259 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12260 ; SSE-NEXT: movapd 3344(%rdi), %xmm1
12261 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12262 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12263 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12264 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12265 ; SSE-NEXT: movapd 3360(%rdi), %xmm6
12266 ; SSE-NEXT: movapd 3408(%rdi), %xmm0
12267 ; SSE-NEXT: movapd %xmm0, %xmm4
12268 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1]
12269 ; SSE-NEXT: movapd 3424(%rdi), %xmm11
12270 ; SSE-NEXT: shufpd {{.*#+}} xmm6 = xmm6[1],xmm11[0]
12271 ; SSE-NEXT: movapd 3376(%rdi), %xmm2
12272 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm2[0],xmm11[1]
12273 ; SSE-NEXT: movapd 3440(%rdi), %xmm1
12274 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12275 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12276 ; SSE-NEXT: movapd 3392(%rdi), %xmm2
12277 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12278 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12279 ; SSE-NEXT: movapd 3456(%rdi), %xmm1
12280 ; SSE-NEXT: shufpd {{.*#+}} xmm2 = xmm2[1],xmm1[0]
12281 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12282 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
12283 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12284 ; SSE-NEXT: movapd 3472(%rdi), %xmm5
12285 ; SSE-NEXT: movapd 3520(%rdi), %xmm3
12286 ; SSE-NEXT: movapd %xmm3, %xmm2
12287 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm5[0],xmm2[1]
12288 ; SSE-NEXT: movapd 3536(%rdi), %xmm8
12289 ; SSE-NEXT: shufpd {{.*#+}} xmm5 = xmm5[1],xmm8[0]
12290 ; SSE-NEXT: movapd 3488(%rdi), %xmm13
12291 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1]
12292 ; SSE-NEXT: movapd 3552(%rdi), %xmm0
12293 ; SSE-NEXT: shufpd {{.*#+}} xmm13 = xmm13[1],xmm0[0]
12294 ; SSE-NEXT: movapd 3504(%rdi), %xmm1
12295 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
12296 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12297 ; SSE-NEXT: movapd %xmm1, %xmm0
12298 ; SSE-NEXT: movapd 3568(%rdi), %xmm1
12299 ; SSE-NEXT: shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
12300 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12301 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm3[0],xmm1[1]
12302 ; SSE-NEXT: movapd %xmm2, 496(%rsi)
12303 ; SSE-NEXT: movapd %xmm4, 480(%rsi)
12304 ; SSE-NEXT: movapd %xmm7, 464(%rsi)
12305 ; SSE-NEXT: movapd %xmm10, 448(%rsi)
12306 ; SSE-NEXT: movapd %xmm14, 432(%rsi)
12307 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12308 ; SSE-NEXT: movaps %xmm0, 416(%rsi)
12309 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12310 ; SSE-NEXT: movaps %xmm0, 400(%rsi)
12311 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12312 ; SSE-NEXT: movaps %xmm0, 384(%rsi)
12313 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12314 ; SSE-NEXT: movaps %xmm0, 368(%rsi)
12315 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12316 ; SSE-NEXT: movaps %xmm0, 352(%rsi)
12317 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12318 ; SSE-NEXT: movaps %xmm0, 336(%rsi)
12319 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12320 ; SSE-NEXT: movaps %xmm0, 320(%rsi)
12321 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12322 ; SSE-NEXT: movaps %xmm0, 304(%rsi)
12323 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12324 ; SSE-NEXT: movaps %xmm0, 288(%rsi)
12325 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12326 ; SSE-NEXT: movaps %xmm0, 272(%rsi)
12327 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12328 ; SSE-NEXT: movaps %xmm0, 256(%rsi)
12329 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12330 ; SSE-NEXT: movaps %xmm0, 240(%rsi)
12331 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12332 ; SSE-NEXT: movaps %xmm0, 224(%rsi)
12333 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12334 ; SSE-NEXT: movaps %xmm0, 208(%rsi)
12335 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12336 ; SSE-NEXT: movaps %xmm0, 192(%rsi)
12337 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12338 ; SSE-NEXT: movaps %xmm0, 176(%rsi)
12339 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12340 ; SSE-NEXT: movaps %xmm0, 160(%rsi)
12341 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12342 ; SSE-NEXT: movaps %xmm0, 144(%rsi)
12343 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12344 ; SSE-NEXT: movaps %xmm0, 128(%rsi)
12345 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12346 ; SSE-NEXT: movaps %xmm0, 112(%rsi)
12347 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12348 ; SSE-NEXT: movaps %xmm0, 96(%rsi)
12349 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12350 ; SSE-NEXT: movaps %xmm0, 80(%rsi)
12351 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12352 ; SSE-NEXT: movaps %xmm0, 64(%rsi)
12353 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12354 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
12355 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12356 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
12357 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12358 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
12359 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12360 ; SSE-NEXT: movaps %xmm0, (%rsi)
12361 ; SSE-NEXT: movapd %xmm5, 496(%rdx)
12362 ; SSE-NEXT: movapd %xmm6, 480(%rdx)
12363 ; SSE-NEXT: movapd %xmm9, 464(%rdx)
12364 ; SSE-NEXT: movapd %xmm12, 448(%rdx)
12365 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12366 ; SSE-NEXT: movaps %xmm0, 432(%rdx)
12367 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12368 ; SSE-NEXT: movaps %xmm0, 416(%rdx)
12369 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12370 ; SSE-NEXT: movaps %xmm0, 400(%rdx)
12371 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12372 ; SSE-NEXT: movaps %xmm0, 384(%rdx)
12373 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12374 ; SSE-NEXT: movaps %xmm0, 368(%rdx)
12375 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12376 ; SSE-NEXT: movaps %xmm0, 352(%rdx)
12377 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12378 ; SSE-NEXT: movaps %xmm0, 336(%rdx)
12379 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12380 ; SSE-NEXT: movaps %xmm0, 320(%rdx)
12381 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12382 ; SSE-NEXT: movaps %xmm0, 304(%rdx)
12383 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12384 ; SSE-NEXT: movaps %xmm0, 288(%rdx)
12385 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12386 ; SSE-NEXT: movaps %xmm0, 272(%rdx)
12387 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12388 ; SSE-NEXT: movaps %xmm0, 256(%rdx)
12389 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12390 ; SSE-NEXT: movaps %xmm0, 240(%rdx)
12391 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12392 ; SSE-NEXT: movaps %xmm0, 224(%rdx)
12393 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12394 ; SSE-NEXT: movaps %xmm0, 208(%rdx)
12395 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12396 ; SSE-NEXT: movaps %xmm0, 192(%rdx)
12397 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12398 ; SSE-NEXT: movaps %xmm0, 176(%rdx)
12399 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12400 ; SSE-NEXT: movaps %xmm0, 160(%rdx)
12401 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12402 ; SSE-NEXT: movaps %xmm0, 144(%rdx)
12403 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12404 ; SSE-NEXT: movaps %xmm0, 128(%rdx)
12405 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12406 ; SSE-NEXT: movaps %xmm0, 112(%rdx)
12407 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12408 ; SSE-NEXT: movaps %xmm0, 96(%rdx)
12409 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12410 ; SSE-NEXT: movaps %xmm0, 80(%rdx)
12411 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12412 ; SSE-NEXT: movaps %xmm0, 64(%rdx)
12413 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12414 ; SSE-NEXT: movaps %xmm0, 48(%rdx)
12415 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12416 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
12417 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12418 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
12419 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12420 ; SSE-NEXT: movaps %xmm0, (%rdx)
12421 ; SSE-NEXT: movapd %xmm8, 496(%rcx)
12422 ; SSE-NEXT: movapd %xmm11, 480(%rcx)
12423 ; SSE-NEXT: movapd %xmm15, 464(%rcx)
12424 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12425 ; SSE-NEXT: movaps %xmm0, 448(%rcx)
12426 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12427 ; SSE-NEXT: movaps %xmm0, 432(%rcx)
12428 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12429 ; SSE-NEXT: movaps %xmm0, 416(%rcx)
12430 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12431 ; SSE-NEXT: movaps %xmm0, 400(%rcx)
12432 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12433 ; SSE-NEXT: movaps %xmm0, 384(%rcx)
12434 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12435 ; SSE-NEXT: movaps %xmm0, 368(%rcx)
12436 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12437 ; SSE-NEXT: movaps %xmm0, 352(%rcx)
12438 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12439 ; SSE-NEXT: movaps %xmm0, 336(%rcx)
12440 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12441 ; SSE-NEXT: movaps %xmm0, 320(%rcx)
12442 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12443 ; SSE-NEXT: movaps %xmm0, 304(%rcx)
12444 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12445 ; SSE-NEXT: movaps %xmm0, 288(%rcx)
12446 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12447 ; SSE-NEXT: movaps %xmm0, 272(%rcx)
12448 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12449 ; SSE-NEXT: movaps %xmm0, 256(%rcx)
12450 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12451 ; SSE-NEXT: movaps %xmm0, 240(%rcx)
12452 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12453 ; SSE-NEXT: movaps %xmm0, 224(%rcx)
12454 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12455 ; SSE-NEXT: movaps %xmm0, 208(%rcx)
12456 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12457 ; SSE-NEXT: movaps %xmm0, 192(%rcx)
12458 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12459 ; SSE-NEXT: movaps %xmm0, 176(%rcx)
12460 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12461 ; SSE-NEXT: movaps %xmm0, 160(%rcx)
12462 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12463 ; SSE-NEXT: movaps %xmm0, 144(%rcx)
12464 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12465 ; SSE-NEXT: movaps %xmm0, 128(%rcx)
12466 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12467 ; SSE-NEXT: movaps %xmm0, 112(%rcx)
12468 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12469 ; SSE-NEXT: movaps %xmm0, 96(%rcx)
12470 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12471 ; SSE-NEXT: movaps %xmm0, 80(%rcx)
12472 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12473 ; SSE-NEXT: movaps %xmm0, 64(%rcx)
12474 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12475 ; SSE-NEXT: movaps %xmm0, 48(%rcx)
12476 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12477 ; SSE-NEXT: movaps %xmm0, 32(%rcx)
12478 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12479 ; SSE-NEXT: movaps %xmm0, 16(%rcx)
12480 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12481 ; SSE-NEXT: movaps %xmm0, (%rcx)
12482 ; SSE-NEXT: movapd %xmm13, 496(%r8)
12483 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12484 ; SSE-NEXT: movaps %xmm0, 480(%r8)
12485 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
12486 ; SSE-NEXT: movaps %xmm0, 464(%r8)
12487 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12488 ; SSE-NEXT: movaps %xmm0, 448(%r8)
12489 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12490 ; SSE-NEXT: movaps %xmm0, 432(%r8)
12491 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12492 ; SSE-NEXT: movaps %xmm0, 416(%r8)
12493 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12494 ; SSE-NEXT: movaps %xmm0, 400(%r8)
12495 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12496 ; SSE-NEXT: movaps %xmm0, 384(%r8)
12497 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12498 ; SSE-NEXT: movaps %xmm0, 368(%r8)
12499 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12500 ; SSE-NEXT: movaps %xmm0, 352(%r8)
12501 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12502 ; SSE-NEXT: movaps %xmm0, 336(%r8)
12503 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12504 ; SSE-NEXT: movaps %xmm0, 320(%r8)
12505 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12506 ; SSE-NEXT: movaps %xmm0, 304(%r8)
12507 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12508 ; SSE-NEXT: movaps %xmm0, 288(%r8)
12509 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12510 ; SSE-NEXT: movaps %xmm0, 272(%r8)
12511 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12512 ; SSE-NEXT: movaps %xmm0, 256(%r8)
12513 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12514 ; SSE-NEXT: movaps %xmm0, 240(%r8)
12515 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12516 ; SSE-NEXT: movaps %xmm0, 224(%r8)
12517 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12518 ; SSE-NEXT: movaps %xmm0, 208(%r8)
12519 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12520 ; SSE-NEXT: movaps %xmm0, 192(%r8)
12521 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12522 ; SSE-NEXT: movaps %xmm0, 176(%r8)
12523 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12524 ; SSE-NEXT: movaps %xmm0, 160(%r8)
12525 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12526 ; SSE-NEXT: movaps %xmm0, 144(%r8)
12527 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12528 ; SSE-NEXT: movaps %xmm0, 128(%r8)
12529 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12530 ; SSE-NEXT: movaps %xmm0, 112(%r8)
12531 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12532 ; SSE-NEXT: movaps %xmm0, 96(%r8)
12533 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12534 ; SSE-NEXT: movaps %xmm0, 80(%r8)
12535 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12536 ; SSE-NEXT: movaps %xmm0, 64(%r8)
12537 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12538 ; SSE-NEXT: movaps %xmm0, 48(%r8)
12539 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12540 ; SSE-NEXT: movaps %xmm0, 32(%r8)
12541 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12542 ; SSE-NEXT: movaps %xmm0, 16(%r8)
12543 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12544 ; SSE-NEXT: movaps %xmm0, (%r8)
12545 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12546 ; SSE-NEXT: movaps %xmm0, 496(%r9)
12547 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12548 ; SSE-NEXT: movaps %xmm0, 480(%r9)
12549 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12550 ; SSE-NEXT: movaps %xmm0, 464(%r9)
12551 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12552 ; SSE-NEXT: movaps %xmm0, 448(%r9)
12553 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12554 ; SSE-NEXT: movaps %xmm0, 432(%r9)
12555 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12556 ; SSE-NEXT: movaps %xmm0, 416(%r9)
12557 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12558 ; SSE-NEXT: movaps %xmm0, 400(%r9)
12559 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12560 ; SSE-NEXT: movaps %xmm0, 384(%r9)
12561 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12562 ; SSE-NEXT: movaps %xmm0, 368(%r9)
12563 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12564 ; SSE-NEXT: movaps %xmm0, 352(%r9)
12565 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12566 ; SSE-NEXT: movaps %xmm0, 336(%r9)
12567 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12568 ; SSE-NEXT: movaps %xmm0, 320(%r9)
12569 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12570 ; SSE-NEXT: movaps %xmm0, 304(%r9)
12571 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12572 ; SSE-NEXT: movaps %xmm0, 288(%r9)
12573 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12574 ; SSE-NEXT: movaps %xmm0, 272(%r9)
12575 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12576 ; SSE-NEXT: movaps %xmm0, 256(%r9)
12577 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12578 ; SSE-NEXT: movaps %xmm0, 240(%r9)
12579 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12580 ; SSE-NEXT: movaps %xmm0, 224(%r9)
12581 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12582 ; SSE-NEXT: movaps %xmm0, 208(%r9)
12583 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12584 ; SSE-NEXT: movaps %xmm0, 192(%r9)
12585 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12586 ; SSE-NEXT: movaps %xmm0, 176(%r9)
12587 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12588 ; SSE-NEXT: movaps %xmm0, 160(%r9)
12589 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12590 ; SSE-NEXT: movaps %xmm0, 144(%r9)
12591 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12592 ; SSE-NEXT: movaps %xmm0, 128(%r9)
12593 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12594 ; SSE-NEXT: movaps %xmm0, 112(%r9)
12595 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12596 ; SSE-NEXT: movaps %xmm0, 96(%r9)
12597 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12598 ; SSE-NEXT: movaps %xmm0, 80(%r9)
12599 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12600 ; SSE-NEXT: movaps %xmm0, 64(%r9)
12601 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12602 ; SSE-NEXT: movaps %xmm0, 48(%r9)
12603 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12604 ; SSE-NEXT: movaps %xmm0, 32(%r9)
12605 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12606 ; SSE-NEXT: movaps %xmm0, 16(%r9)
12607 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12608 ; SSE-NEXT: movaps %xmm0, (%r9)
12609 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
12610 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12611 ; SSE-NEXT: movaps %xmm0, 496(%rax)
12612 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12613 ; SSE-NEXT: movaps %xmm0, 480(%rax)
12614 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12615 ; SSE-NEXT: movaps %xmm0, 464(%rax)
12616 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12617 ; SSE-NEXT: movaps %xmm0, 448(%rax)
12618 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12619 ; SSE-NEXT: movaps %xmm0, 432(%rax)
12620 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12621 ; SSE-NEXT: movaps %xmm0, 416(%rax)
12622 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12623 ; SSE-NEXT: movaps %xmm0, 400(%rax)
12624 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12625 ; SSE-NEXT: movaps %xmm0, 384(%rax)
12626 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12627 ; SSE-NEXT: movaps %xmm0, 368(%rax)
12628 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12629 ; SSE-NEXT: movaps %xmm0, 352(%rax)
12630 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12631 ; SSE-NEXT: movaps %xmm0, 336(%rax)
12632 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12633 ; SSE-NEXT: movaps %xmm0, 320(%rax)
12634 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12635 ; SSE-NEXT: movaps %xmm0, 304(%rax)
12636 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12637 ; SSE-NEXT: movaps %xmm0, 288(%rax)
12638 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12639 ; SSE-NEXT: movaps %xmm0, 272(%rax)
12640 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12641 ; SSE-NEXT: movaps %xmm0, 256(%rax)
12642 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12643 ; SSE-NEXT: movaps %xmm0, 240(%rax)
12644 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12645 ; SSE-NEXT: movaps %xmm0, 224(%rax)
12646 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12647 ; SSE-NEXT: movaps %xmm0, 208(%rax)
12648 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12649 ; SSE-NEXT: movaps %xmm0, 192(%rax)
12650 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12651 ; SSE-NEXT: movaps %xmm0, 176(%rax)
12652 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12653 ; SSE-NEXT: movaps %xmm0, 160(%rax)
12654 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12655 ; SSE-NEXT: movaps %xmm0, 144(%rax)
12656 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12657 ; SSE-NEXT: movaps %xmm0, 128(%rax)
12658 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12659 ; SSE-NEXT: movaps %xmm0, 112(%rax)
12660 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12661 ; SSE-NEXT: movaps %xmm0, 96(%rax)
12662 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12663 ; SSE-NEXT: movaps %xmm0, 80(%rax)
12664 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12665 ; SSE-NEXT: movaps %xmm0, 64(%rax)
12666 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12667 ; SSE-NEXT: movaps %xmm0, 48(%rax)
12668 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12669 ; SSE-NEXT: movaps %xmm0, 32(%rax)
12670 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12671 ; SSE-NEXT: movaps %xmm0, 16(%rax)
12672 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12673 ; SSE-NEXT: movaps %xmm0, (%rax)
12674 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
12675 ; SSE-NEXT: movapd %xmm1, 496(%rax)
12676 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12677 ; SSE-NEXT: movaps %xmm0, 480(%rax)
12678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12679 ; SSE-NEXT: movaps %xmm0, 464(%rax)
12680 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12681 ; SSE-NEXT: movaps %xmm0, 448(%rax)
12682 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12683 ; SSE-NEXT: movaps %xmm0, 432(%rax)
12684 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12685 ; SSE-NEXT: movaps %xmm0, 416(%rax)
12686 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12687 ; SSE-NEXT: movaps %xmm0, 400(%rax)
12688 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12689 ; SSE-NEXT: movaps %xmm0, 384(%rax)
12690 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12691 ; SSE-NEXT: movaps %xmm0, 368(%rax)
12692 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12693 ; SSE-NEXT: movaps %xmm0, 352(%rax)
12694 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12695 ; SSE-NEXT: movaps %xmm0, 336(%rax)
12696 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12697 ; SSE-NEXT: movaps %xmm0, 320(%rax)
12698 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12699 ; SSE-NEXT: movaps %xmm0, 304(%rax)
12700 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12701 ; SSE-NEXT: movaps %xmm0, 288(%rax)
12702 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12703 ; SSE-NEXT: movaps %xmm0, 272(%rax)
12704 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12705 ; SSE-NEXT: movaps %xmm0, 256(%rax)
12706 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12707 ; SSE-NEXT: movaps %xmm0, 240(%rax)
12708 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12709 ; SSE-NEXT: movaps %xmm0, 224(%rax)
12710 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12711 ; SSE-NEXT: movaps %xmm0, 208(%rax)
12712 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12713 ; SSE-NEXT: movaps %xmm0, 192(%rax)
12714 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12715 ; SSE-NEXT: movaps %xmm0, 176(%rax)
12716 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12717 ; SSE-NEXT: movaps %xmm0, 160(%rax)
12718 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12719 ; SSE-NEXT: movaps %xmm0, 144(%rax)
12720 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12721 ; SSE-NEXT: movaps %xmm0, 128(%rax)
12722 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12723 ; SSE-NEXT: movaps %xmm0, 112(%rax)
12724 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12725 ; SSE-NEXT: movaps %xmm0, 96(%rax)
12726 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12727 ; SSE-NEXT: movaps %xmm0, 80(%rax)
12728 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12729 ; SSE-NEXT: movaps %xmm0, 64(%rax)
12730 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12731 ; SSE-NEXT: movaps %xmm0, 48(%rax)
12732 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12733 ; SSE-NEXT: movaps %xmm0, 32(%rax)
12734 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12735 ; SSE-NEXT: movaps %xmm0, 16(%rax)
12736 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12737 ; SSE-NEXT: movaps %xmm0, (%rax)
12738 ; SSE-NEXT: addq $3240, %rsp # imm = 0xCA8
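12739 ; SSE-NEXT: retq
12740 ;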
12741 ; AVX-LABEL: load_i64_stride7_vf64:
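12742 ; AVX: # %bb.0: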
12743 ; AVX-NEXT: subq $4232, %rsp # imm = 0x1088
12744 ; AVX-NEXT: vmovaps 1216(%rdi), %ymm3
12745 ; AVX-NEXT: vmovaps 768(%rdi), %ymm4
12746 ; AVX-NEXT: vmovaps 320(%rdi), %ymm5
12747 ; AVX-NEXT: vinsertf128 $1, 384(%rdi), %ymm0, %ymm0
12748 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12749 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
12750 ; AVX-NEXT: vmovaps 224(%rdi), %xmm10
12751 ; AVX-NEXT: vmovaps 272(%rdi), %xmm1
12752 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12753 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
12754 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12755 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12756 ; AVX-NEXT: vinsertf128 $1, 832(%rdi), %ymm0, %ymm0
12757 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12758 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
12759 ; AVX-NEXT: vmovaps 672(%rdi), %xmm11
12760 ; AVX-NEXT: vmovaps 720(%rdi), %xmm1
12761 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12762 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
12763 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12764 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12765 ; AVX-NEXT: vinsertf128 $1, 1280(%rdi), %ymm0, %ymm0
12766 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12767 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
12768 ; AVX-NEXT: vmovaps 1120(%rdi), %xmm12
12769 ; AVX-NEXT: vmovaps 1168(%rdi), %xmm1
12770 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12771 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
12772 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12773 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12774 ; AVX-NEXT: vmovapd 1664(%rdi), %ymm6
12775 ; AVX-NEXT: vinsertf128 $1, 1728(%rdi), %ymm0, %ymm0
12776 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12777 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm6[0,1,2],ymm0[3]
12778 ; AVX-NEXT: vmovapd 1568(%rdi), %xmm2
12779 ; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12780 ; AVX-NEXT: vmovapd 1616(%rdi), %xmm1
12781 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12782 ; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12783 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
12784 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12785 ; AVX-NEXT: vmovapd 2112(%rdi), %ymm7
12786 ; AVX-NEXT: vinsertf128 $1, 2176(%rdi), %ymm0, %ymm0
12787 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12788 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1,2],ymm0[3]
12789 ; AVX-NEXT: vmovapd 2016(%rdi), %xmm2
12790 ; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12791 ; AVX-NEXT: vmovapd 2064(%rdi), %xmm1
12792 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12793 ; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12794 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
12795 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12796 ; AVX-NEXT: vmovapd 2560(%rdi), %ymm8
12797 ; AVX-NEXT: vinsertf128 $1, 2624(%rdi), %ymm0, %ymm0
12798 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12799 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm8[0,1,2],ymm0[3]
12800 ; AVX-NEXT: vmovapd 2464(%rdi), %xmm2
12801 ; AVX-NEXT: vmovapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12802 ; AVX-NEXT: vmovapd 2512(%rdi), %xmm1
12803 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12804 ; AVX-NEXT: vblendpd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
12805 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
12806 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12807 ; AVX-NEXT: vmovaps 3008(%rdi), %ymm1
12808 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12809 ; AVX-NEXT: vinsertf128 $1, 3072(%rdi), %ymm0, %ymm0
12810 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12811 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12812 ; AVX-NEXT: vmovaps 2912(%rdi), %xmm0
12813 ; AVX-NEXT: vmovaps 2960(%rdi), %xmm2
12814 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12815 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3]
12816 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12817 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12818 ; AVX-NEXT: vmovaps 3456(%rdi), %ymm2
12819 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12820 ; AVX-NEXT: vinsertf128 $1, 3520(%rdi), %ymm0, %ymm1
12821 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12822 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12823 ; AVX-NEXT: vmovaps 3360(%rdi), %xmm15
12824 ; AVX-NEXT: vmovaps 3408(%rdi), %xmm2
12825 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12826 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
12827 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12828 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12829 ; AVX-NEXT: vmovaps 96(%rdi), %ymm2
12830 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12831 ; AVX-NEXT: vinsertf128 $1, 160(%rdi), %ymm0, %ymm1
12832 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12833 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12834 ; AVX-NEXT: vmovaps 48(%rdi), %xmm2
12835 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12836 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = mem[0,1],xmm2[2,3]
12837 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12838 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12839 ; AVX-NEXT: vmovaps 544(%rdi), %ymm2
12840 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12841 ; AVX-NEXT: vinsertf128 $1, 608(%rdi), %ymm0, %ymm1
12842 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12843 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12844 ; AVX-NEXT: vmovaps 448(%rdi), %xmm9
12845 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12846 ; AVX-NEXT: vmovaps 496(%rdi), %xmm2
12847 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12848 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
12849 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12850 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12851 ; AVX-NEXT: vmovaps 992(%rdi), %ymm2
12852 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12853 ; AVX-NEXT: vinsertf128 $1, 1056(%rdi), %ymm0, %ymm1
12854 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12855 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12856 ; AVX-NEXT: vmovaps 896(%rdi), %xmm9
12857 ; AVX-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12858 ; AVX-NEXT: vmovaps 944(%rdi), %xmm2
12859 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12860 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
12861 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12862 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12863 ; AVX-NEXT: vmovaps 1440(%rdi), %ymm2
12864 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12865 ; AVX-NEXT: vinsertf128 $1, 1504(%rdi), %ymm0, %ymm1
12866 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12867 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12868 ; AVX-NEXT: vmovaps 1344(%rdi), %xmm9
12869 ; AVX-NEXT: vmovaps 1392(%rdi), %xmm2
12870 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12871 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
12872 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12873 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12874 ; AVX-NEXT: vmovaps 1888(%rdi), %ymm2
12875 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12876 ; AVX-NEXT: vinsertf128 $1, 1952(%rdi), %ymm0, %ymm1
12877 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12878 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12879 ; AVX-NEXT: vmovaps 1792(%rdi), %xmm13
12880 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12881 ; AVX-NEXT: vmovaps 1840(%rdi), %xmm2
12882 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12883 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
12884 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12885 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12886 ; AVX-NEXT: vmovaps 2336(%rdi), %ymm2
12887 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12888 ; AVX-NEXT: vinsertf128 $1, 2400(%rdi), %ymm0, %ymm1
12889 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12890 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12891 ; AVX-NEXT: vmovaps 2240(%rdi), %xmm13
12892 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12893 ; AVX-NEXT: vmovaps 2288(%rdi), %xmm2
12894 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12895 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
12896 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12897 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12898 ; AVX-NEXT: vmovaps 2784(%rdi), %ymm2
12899 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12900 ; AVX-NEXT: vinsertf128 $1, 2848(%rdi), %ymm0, %ymm1
12901 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12902 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12903 ; AVX-NEXT: vmovaps 2688(%rdi), %xmm13
12904 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12905 ; AVX-NEXT: vmovaps 2736(%rdi), %xmm2
12906 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12907 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
12908 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12909 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12910 ; AVX-NEXT: vmovaps 3232(%rdi), %ymm2
12911 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12912 ; AVX-NEXT: vinsertf128 $1, 3296(%rdi), %ymm0, %ymm1
12913 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12914 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12915 ; AVX-NEXT: vmovaps 3136(%rdi), %xmm13
12916 ; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12917 ; AVX-NEXT: vmovaps 3184(%rdi), %xmm2
12918 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12919 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1],xmm2[2,3]
12920 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12921 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12922 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm1
12923 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm10[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
12924 ; AVX-NEXT: vmovapd 384(%rdi), %ymm2
12925 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12926 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm5[0],ymm2[1],ymm5[3],ymm2[2]
12927 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
12928 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12929 ; AVX-NEXT: vmovdqa 736(%rdi), %xmm1
12930 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm11[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
12931 ; AVX-NEXT: vmovapd 832(%rdi), %ymm14
12932 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm4[0],ymm14[1],ymm4[3],ymm14[2]
12933 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
12934 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12935 ; AVX-NEXT: vmovdqa 1184(%rdi), %xmm1
12936 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm12[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
12937 ; AVX-NEXT: vmovapd 1280(%rdi), %ymm13
12938 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm3[0],ymm13[1],ymm3[3],ymm13[2]
12939 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
12940 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12941 ; AVX-NEXT: vmovdqa 1632(%rdi), %xmm1
12942 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12943 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
12944 ; AVX-NEXT: vmovapd 1728(%rdi), %ymm12
12945 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm6[0],ymm12[1],ymm6[3],ymm12[2]
12946 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
12947 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12948 ; AVX-NEXT: vmovdqa 2080(%rdi), %xmm1
12949 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12950 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
12951 ; AVX-NEXT: vmovapd 2176(%rdi), %ymm11
12952 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm7[0],ymm11[1],ymm7[3],ymm11[2]
12953 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
12954 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12955 ; AVX-NEXT: vmovdqa 2528(%rdi), %xmm1
12956 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12957 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
12958 ; AVX-NEXT: vmovapd 2624(%rdi), %ymm10
12959 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm8[0],ymm10[1],ymm8[3],ymm10[2]
12960 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2,3]
12961 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12962 ; AVX-NEXT: vmovdqa 2976(%rdi), %xmm1
12963 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
12964 ; AVX-NEXT: vmovapd 3072(%rdi), %ymm2
12965 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12966 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12967 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[3],ymm2[2]
12968 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
12969 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12970 ; AVX-NEXT: vmovdqa 3424(%rdi), %xmm0
12971 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
12972 ; AVX-NEXT: vmovapd 3520(%rdi), %ymm15
12973 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12974 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0],ymm15[1],ymm1[3],ymm15[2]
12975 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
12976 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12977 ; AVX-NEXT: vmovapd 160(%rdi), %ymm0
12978 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12979 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12980 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
12981 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm1
12982 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12983 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
12984 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
12985 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12986 ; AVX-NEXT: vmovapd 608(%rdi), %ymm0
12987 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12988 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12989 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
12990 ; AVX-NEXT: vmovdqa 512(%rdi), %xmm1
12991 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12992 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12993 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
12994 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
12995 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12996 ; AVX-NEXT: vmovapd 1056(%rdi), %ymm0
12997 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12998 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12999 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
13000 ; AVX-NEXT: vmovdqa 960(%rdi), %xmm1
13001 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13002 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13003 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
13004 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13005 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13006 ; AVX-NEXT: vmovapd 1504(%rdi), %ymm0
13007 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13008 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13009 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[3],ymm0[2]
13010 ; AVX-NEXT: vmovdqa 1408(%rdi), %xmm1
13011 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13012 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm9[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
13013 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13014 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13015 ; AVX-NEXT: vmovapd 1952(%rdi), %ymm9
13016 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13017 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm9[1],ymm0[3],ymm9[2]
13018 ; AVX-NEXT: vmovdqa 1856(%rdi), %xmm1
13019 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13020 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13021 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
13022 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13023 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13024 ; AVX-NEXT: vmovapd 2400(%rdi), %ymm6
13025 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13026 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2]
13027 ; AVX-NEXT: vmovdqa 2304(%rdi), %xmm8
13028 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm1 # 16-byte Folded Reload
13029 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
13030 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13031 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13032 ; AVX-NEXT: vmovapd 2848(%rdi), %ymm4
13033 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13034 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm4[1],ymm0[3],ymm4[2]
13035 ; AVX-NEXT: vmovdqa 2752(%rdi), %xmm5
13036 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
13037 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
13038 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13039 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13040 ; AVX-NEXT: vmovapd 3296(%rdi), %ymm2
13041 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13042 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[2]
13043 ; AVX-NEXT: vmovdqa 3200(%rdi), %xmm3
13044 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm7 # 16-byte Folded Reload
13045 ; AVX-NEXT: # xmm7 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
13046 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
13047 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13048 ; AVX-NEXT: vmovaps 352(%rdi), %xmm0
13049 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13050 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13051 ; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
13052 ; AVX-NEXT: # ymm1 = ymm0[0,1,2,3,4,5],mem[6,7]
13053 ; AVX-NEXT: vmovaps 240(%rdi), %xmm0
13054 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
13055 ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
13056 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm1[4,5,6,7]
13057 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13058 ; AVX-NEXT: vmovaps 800(%rdi), %xmm0
13059 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13060 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13061 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm14[3]
13062 ; AVX-NEXT: vmovapd 688(%rdi), %xmm7
13063 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13064 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
13065 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
13066 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13067 ; AVX-NEXT: vmovaps 1248(%rdi), %xmm0
13068 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13069 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13070 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm13[3]
13071 ; AVX-NEXT: vmovapd 1136(%rdi), %xmm7
13072 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13073 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
13074 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
13075 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13076 ; AVX-NEXT: vmovaps 1696(%rdi), %xmm0
13077 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13078 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13079 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm12[3]
13080 ; AVX-NEXT: vmovapd 1584(%rdi), %xmm7
13081 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13082 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
13083 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
13084 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13085 ; AVX-NEXT: vmovaps 2144(%rdi), %xmm0
13086 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13087 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13088 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm11[3]
13089 ; AVX-NEXT: vmovapd 2032(%rdi), %xmm7
13090 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13091 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
13092 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
13093 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13094 ; AVX-NEXT: vmovaps 2592(%rdi), %xmm0
13095 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13096 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13097 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm10[3]
13098 ; AVX-NEXT: vmovapd 2480(%rdi), %xmm7
13099 ; AVX-NEXT: vmovupd %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13100 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm7[0],mem[1]
13101 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
13102 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13103 ; AVX-NEXT: vmovaps 3040(%rdi), %xmm0
13104 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13105 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13106 ; AVX-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13107 ; AVX-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
13108 ; AVX-NEXT: vmovaps 2928(%rdi), %xmm7
13109 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13110 ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],mem[2,3]
13111 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
13112 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13113 ; AVX-NEXT: vmovaps 3488(%rdi), %xmm0
13114 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13115 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13116 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm15[3]
13117 ; AVX-NEXT: vmovapd 3376(%rdi), %xmm1
13118 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13119 ; AVX-NEXT: vblendpd {{.*#+}} xmm7 = xmm1[0],mem[1]
13120 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm7[0,1],ymm0[2,3]
13121 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13122 ; AVX-NEXT: vmovaps 3264(%rdi), %xmm0
13123 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13124 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13125 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm2[3]
13126 ; AVX-NEXT: vmovdqa 3152(%rdi), %xmm2
13127 ; AVX-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13128 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm3[4,5,6,7]
13129 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13130 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13131 ; AVX-NEXT: vmovaps 2816(%rdi), %xmm0
13132 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13133 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13134 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm4[3]
13135 ; AVX-NEXT: vmovdqa 2704(%rdi), %xmm1
13136 ; AVX-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13137 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4,5,6,7]
13138 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13139 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13140 ; AVX-NEXT: vmovdqa 2368(%rdi), %xmm14
13141 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm0
13142 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3]
13143 ; AVX-NEXT: vmovdqa 2256(%rdi), %xmm10
13144 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2,3],xmm8[4,5,6,7]
13145 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13146 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13147 ; AVX-NEXT: vmovdqa 1920(%rdi), %xmm8
13148 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1
13149 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm1[0,1,2],ymm9[3]
13150 ; AVX-NEXT: vmovapd 1808(%rdi), %xmm5
13151 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm2 # 16-byte Folded Reload
13152 ; AVX-NEXT: # xmm2 = xmm5[0],mem[1]
13153 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
13154 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13155 ; AVX-NEXT: vmovdqa 1472(%rdi), %xmm7
13156 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm1
13157 ; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
13158 ; AVX-NEXT: # ymm1 = ymm1[0,1,2],mem[3]
13159 ; AVX-NEXT: vmovapd 1360(%rdi), %xmm3
13160 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm2 # 16-byte Folded Reload
13161 ; AVX-NEXT: # xmm2 = xmm3[0],mem[1]
13162 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
13163 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13164 ; AVX-NEXT: vmovdqa 1024(%rdi), %xmm4
13165 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm1
13166 ; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm2 # 32-byte Folded Reload
13167 ; AVX-NEXT: # ymm2 = ymm1[0,1,2],mem[3]
13168 ; AVX-NEXT: vmovapd 912(%rdi), %xmm1
13169 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
13170 ; AVX-NEXT: # xmm6 = xmm1[0],mem[1]
13171 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1],ymm2[2,3]
13172 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13173 ; AVX-NEXT: vmovdqa 576(%rdi), %xmm9
13174 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm2
13175 ; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
13176 ; AVX-NEXT: # ymm2 = ymm2[0,1,2],mem[3]
13177 ; AVX-NEXT: vmovapd 464(%rdi), %xmm6
13178 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm11 # 16-byte Folded Reload
13179 ; AVX-NEXT: # xmm11 = xmm6[0],mem[1]
13180 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3]
13181 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13182 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm2
13183 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm11
13184 ; AVX-NEXT: vblendpd $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
13185 ; AVX-NEXT: # ymm11 = ymm11[0,1,2],mem[3]
13186 ; AVX-NEXT: vmovapd 16(%rdi), %xmm12
13187 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm13 # 16-byte Folded Reload
13188 ; AVX-NEXT: # xmm13 = xmm12[0],mem[1]
13189 ; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm13[0,1],ymm11[2,3]
13190 ; AVX-NEXT: vmovupd %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13191 ; AVX-NEXT: vmovapd 80(%rdi), %xmm13
13192 ; AVX-NEXT: vshufpd {{.*#+}} ymm11 = ymm12[1],ymm13[0],ymm12[2],ymm13[3]
13193 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm12
13194 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13195 ; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
13196 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13197 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3]
13198 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13199 ; AVX-NEXT: vmovapd 304(%rdi), %xmm2
13200 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13201 ; AVX-NEXT: vmovupd (%rsp), %ymm0 # 32-byte Reload
13202 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm0[1],ymm2[0],ymm0[2],ymm2[3]
13203 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm0
13204 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13205 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
13206 ; AVX-NEXT: # xmm11 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13207 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
13208 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm2[0,1],ymm11[2,3]
13209 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13210 ; AVX-NEXT: vmovapd 528(%rdi), %xmm11
13211 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm11[0],ymm6[2],ymm11[3]
13212 ; AVX-NEXT: vmovdqa 640(%rdi), %xmm2
13213 ; AVX-NEXT: vpalignr {{.*#+}} xmm9 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
13214 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
13215 ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3]
13216 ; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13217 ; AVX-NEXT: vmovapd 752(%rdi), %xmm9
13218 ; AVX-NEXT: vmovupd %ymm9, (%rsp) # 32-byte Spill
13219 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13220 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm6[1],ymm9[0],ymm6[2],ymm9[3]
13221 ; AVX-NEXT: vmovdqa 864(%rdi), %xmm0
13222 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13223 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
13224 ; AVX-NEXT: # xmm9 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13225 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
13226 ; AVX-NEXT: vblendpd {{.*#+}} ymm6 = ymm6[0,1],ymm9[2,3]
13227 ; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13228 ; AVX-NEXT: vmovapd 976(%rdi), %xmm9
13229 ; AVX-NEXT: vshufpd {{.*#+}} ymm6 = ymm1[1],ymm9[0],ymm1[2],ymm9[3]
13230 ; AVX-NEXT: vmovdqa 1088(%rdi), %xmm1
13231 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm4[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
13232 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
13233 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm6[0,1],ymm4[2,3]
13234 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13235 ; AVX-NEXT: vmovapd 1200(%rdi), %xmm6
13236 ; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13237 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13238 ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm6[0],ymm4[2],ymm6[3]
13239 ; AVX-NEXT: vmovdqa 1312(%rdi), %xmm0
13240 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13241 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
13242 ; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13243 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
13244 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm6[2,3]
13245 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13246 ; AVX-NEXT: vmovapd 1424(%rdi), %xmm0
13247 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13248 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm0[0],ymm3[2],ymm0[3]
13249 ; AVX-NEXT: vmovdqa 1536(%rdi), %xmm0
13250 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13251 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm7[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13252 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
13253 ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
13254 ; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13255 ; AVX-NEXT: vmovapd 1648(%rdi), %xmm7
13256 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
13257 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm3[1],ymm7[0],ymm3[2],ymm7[3]
13258 ; AVX-NEXT: vmovdqa 1760(%rdi), %xmm0
13259 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13260 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
13261 ; AVX-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13262 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
13263 ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm4[2,3]
13264 ; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13265 ; AVX-NEXT: vmovapd 1872(%rdi), %xmm0
13266 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13267 ; AVX-NEXT: vshufpd {{.*#+}} ymm3 = ymm5[1],ymm0[0],ymm5[2],ymm0[3]
13268 ; AVX-NEXT: vmovdqa 1984(%rdi), %xmm0
13269 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13270 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm8[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13271 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13272 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm3[0,1],ymm0[2,3]
13273 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13274 ; AVX-NEXT: vmovapd 2096(%rdi), %xmm3
13275 ; AVX-NEXT: vmovupd %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13276 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13277 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm3[0],ymm0[2],ymm3[3]
13278 ; AVX-NEXT: vmovdqa 2208(%rdi), %xmm3
13279 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13280 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
13281 ; AVX-NEXT: # xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
13282 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
13283 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3]
13284 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13285 ; AVX-NEXT: vmovapd 2320(%rdi), %xmm0
13286 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13287 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[1],ymm0[0],ymm10[2],ymm0[3]
13288 ; AVX-NEXT: vmovdqa 2432(%rdi), %xmm3
13289 ; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm14[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
13290 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
13291 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
13292 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13293 ; AVX-NEXT: vmovapd 2544(%rdi), %xmm14
13294 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13295 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],ymm14[0],ymm0[2],ymm14[3]
13296 ; AVX-NEXT: vmovdqa 2656(%rdi), %xmm4
13297 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13298 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
13299 ; AVX-NEXT: # xmm4 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
13300 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
13301 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm4[2,3]
13302 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13303 ; AVX-NEXT: vmovapd 2768(%rdi), %xmm4
13304 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13305 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13306 ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm0[1],ymm4[0],ymm0[2],ymm4[3]
13307 ; AVX-NEXT: vmovdqa 2880(%rdi), %xmm8
13308 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm5 # 16-byte Folded Reload
13309 ; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
13310 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
13311 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
13312 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13313 ; AVX-NEXT: vmovapd 2992(%rdi), %xmm5
13314 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13315 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13316 ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
13317 ; AVX-NEXT: vmovdqa 3104(%rdi), %xmm5
13318 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13319 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
13320 ; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
13321 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
13322 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
13323 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13324 ; AVX-NEXT: vmovapd 3216(%rdi), %xmm5
13325 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13326 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13327 ; AVX-NEXT: vshufpd {{.*#+}} ymm4 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
13328 ; AVX-NEXT: vmovdqa 3328(%rdi), %xmm0
13329 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13330 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
13331 ; AVX-NEXT: # xmm5 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13332 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
13333 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm5[2,3]
13334 ; AVX-NEXT: vmovupd %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13335 ; AVX-NEXT: vmovapd 3440(%rdi), %xmm5
13336 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13337 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
13338 ; AVX-NEXT: vshufpd {{.*#+}} ymm5 = ymm4[1],ymm5[0],ymm4[2],ymm5[3]
13339 ; AVX-NEXT: vmovdqa 3552(%rdi), %xmm4
13340 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm6 # 16-byte Folded Reload
13341 ; AVX-NEXT: # xmm6 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
13342 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
13343 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm6[2,3]
13344 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13345 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
13346 ; AVX-NEXT: vmovapd 128(%rdi), %ymm6
13347 ; AVX-NEXT: vmovupd %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13348 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3]
13349 ; AVX-NEXT: vblendpd {{.*#+}} xmm6 = mem[0],xmm13[1]
13350 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3]
13351 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13352 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
13353 ; AVX-NEXT: vmovapd 352(%rdi), %ymm13
13354 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3]
13355 ; AVX-NEXT: vmovapd 256(%rdi), %xmm6
13356 ; AVX-NEXT: vmovapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13357 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
13358 ; AVX-NEXT: # xmm6 = xmm6[0],mem[1]
13359 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3]
13360 ; AVX-NEXT: vmovupd %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13361 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13362 ; AVX-NEXT: vmovapd 576(%rdi), %ymm6
13363 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm6[0,1,2],ymm2[3]
13364 ; AVX-NEXT: vmovapd 480(%rdi), %xmm0
13365 ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13366 ; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm0[0],xmm11[1]
13367 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3]
13368 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13369 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 16-byte Folded Reload
13370 ; AVX-NEXT: vmovapd 800(%rdi), %ymm11
13371 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm11[0,1,2],ymm2[3]
13372 ; AVX-NEXT: vmovapd 704(%rdi), %xmm5
13373 ; AVX-NEXT: vmovapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13374 ; AVX-NEXT: vblendpd $2, (%rsp), %xmm5, %xmm5 # 16-byte Folded Reload
13375 ; AVX-NEXT: # xmm5 = xmm5[0],mem[1]
13376 ; AVX-NEXT: vblendpd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3]
13377 ; AVX-NEXT: vmovupd %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13378 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13379 ; AVX-NEXT: vmovapd 1024(%rdi), %ymm5
13380 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm5[0,1,2],ymm1[3]
13381 ; AVX-NEXT: vmovapd 928(%rdi), %xmm0
13382 ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13383 ; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm9[1]
13384 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
13385 ; AVX-NEXT: vmovupd %ymm1, (%rsp) # 32-byte Spill
13386 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
13387 ; AVX-NEXT: vmovaps 1248(%rdi), %ymm0
13388 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13389 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13390 ; AVX-NEXT: vmovaps 1152(%rdi), %xmm0
13391 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13392 ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13393 ; AVX-NEXT: # xmm2 = xmm0[0,1],mem[2,3]
13394 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13395 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13396 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
13397 ; AVX-NEXT: vmovaps 1472(%rdi), %ymm9
13398 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
13399 ; AVX-NEXT: vmovaps 1376(%rdi), %xmm15
13400 ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm2 # 16-byte Folded Reload
13401 ; AVX-NEXT: # xmm2 = xmm15[0,1],mem[2,3]
13402 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13403 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13404 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
13405 ; AVX-NEXT: vmovapd 1696(%rdi), %ymm0
13406 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13407 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
13408 ; AVX-NEXT: vmovapd 1600(%rdi), %xmm0
13409 ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13410 ; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm7[1]
13411 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
13412 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13413 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
13414 ; AVX-NEXT: vmovaps 1920(%rdi), %ymm7
13415 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
13416 ; AVX-NEXT: vmovaps 1824(%rdi), %xmm12
13417 ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm2 # 16-byte Folded Reload
13418 ; AVX-NEXT: # xmm2 = xmm12[0,1],mem[2,3]
13419 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13420 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13421 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
13422 ; AVX-NEXT: vmovapd 2144(%rdi), %ymm10
13423 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm10[0,1,2],ymm1[3]
13424 ; AVX-NEXT: vmovapd 2048(%rdi), %xmm0
13425 ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13426 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13427 ; AVX-NEXT: # xmm2 = xmm0[0],mem[1]
13428 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
13429 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13430 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
13431 ; AVX-NEXT: vmovapd 2368(%rdi), %ymm3
13432 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3]
13433 ; AVX-NEXT: vmovapd 2272(%rdi), %xmm0
13434 ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13435 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13436 ; AVX-NEXT: # xmm2 = xmm0[0],mem[1]
13437 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
13438 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13439 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
13440 ; AVX-NEXT: vmovapd 2592(%rdi), %ymm0
13441 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13442 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0,1,2],ymm1[3]
13443 ; AVX-NEXT: vmovapd 2496(%rdi), %xmm0
13444 ; AVX-NEXT: vmovapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13445 ; AVX-NEXT: vblendpd {{.*#+}} xmm2 = xmm0[0],xmm14[1]
13446 ; AVX-NEXT: vblendpd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3]
13447 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13448 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm0
13449 ; AVX-NEXT: vmovapd 2816(%rdi), %ymm2
13450 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3]
13451 ; AVX-NEXT: vmovapd 2720(%rdi), %xmm1
13452 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13453 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13454 ; AVX-NEXT: # xmm1 = xmm1[0],mem[1]
13455 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13456 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13457 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
13458 ; AVX-NEXT: vmovaps 3040(%rdi), %ymm1
13459 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13460 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13461 ; AVX-NEXT: vmovaps 2944(%rdi), %xmm1
13462 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13463 ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13464 ; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
13465 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13466 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13467 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
13468 ; AVX-NEXT: vmovapd 3264(%rdi), %ymm14
13469 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1,2],ymm0[3]
13470 ; AVX-NEXT: vmovapd 3168(%rdi), %xmm1
13471 ; AVX-NEXT: vmovapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13472 ; AVX-NEXT: vblendpd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13473 ; AVX-NEXT: # xmm1 = xmm1[0],mem[1]
13474 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13475 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13476 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
13477 ; AVX-NEXT: vmovaps 3488(%rdi), %ymm1
13478 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13479 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13480 ; AVX-NEXT: vmovaps 3392(%rdi), %xmm1
13481 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13482 ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13483 ; AVX-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
13484 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13485 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13486 ; AVX-NEXT: vmovapd 192(%rdi), %ymm1
13487 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13488 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13489 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
13490 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm1
13491 ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
13492 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13493 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13494 ; AVX-NEXT: vmovapd 416(%rdi), %ymm0
13495 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13496 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm13[0],ymm0[1],ymm13[3],ymm0[2]
13497 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm1
13498 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13499 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13500 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
13501 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13502 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13503 ; AVX-NEXT: vmovdqa 544(%rdi), %xmm0
13504 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13505 ; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13506 ; AVX-NEXT: vmovapd 640(%rdi), %ymm1
13507 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13508 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm6[0],ymm1[1],ymm6[3],ymm1[2]
13509 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
13510 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13511 ; AVX-NEXT: vmovapd 864(%rdi), %ymm0
13512 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13513 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm11[0],ymm0[1],ymm11[3],ymm0[2]
13514 ; AVX-NEXT: vmovdqa 768(%rdi), %xmm1
13515 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13516 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13517 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
13518 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13519 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13520 ; AVX-NEXT: vmovdqa 992(%rdi), %xmm0
13521 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13522 ; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13523 ; AVX-NEXT: vmovapd 1088(%rdi), %ymm1
13524 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13525 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[3],ymm1[2]
13526 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
13527 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13528 ; AVX-NEXT: vmovapd 1312(%rdi), %ymm1
13529 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13530 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13531 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
13532 ; AVX-NEXT: vmovdqa 1216(%rdi), %xmm1
13533 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13534 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13535 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
13536 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13537 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13538 ; AVX-NEXT: vmovdqa 1440(%rdi), %xmm0
13539 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm15[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13540 ; AVX-NEXT: vmovapd 1536(%rdi), %ymm15
13541 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm9[0],ymm15[1],ymm9[3],ymm15[2]
13542 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
13543 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13544 ; AVX-NEXT: vmovapd 1760(%rdi), %ymm1
13545 ; AVX-NEXT: vmovupd %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13546 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13547 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[2]
13548 ; AVX-NEXT: vmovdqa 1664(%rdi), %xmm13
13549 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
13550 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
13551 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13552 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13553 ; AVX-NEXT: vmovdqa 1888(%rdi), %xmm0
13554 ; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm12[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13555 ; AVX-NEXT: vmovapd 1984(%rdi), %ymm11
13556 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[3],ymm11[2]
13557 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
13558 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13559 ; AVX-NEXT: vmovapd 2208(%rdi), %ymm12
13560 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm10[0],ymm12[1],ymm10[3],ymm12[2]
13561 ; AVX-NEXT: vmovdqa 2112(%rdi), %xmm10
13562 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
13563 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
13564 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13565 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13566 ; AVX-NEXT: vmovdqa 2336(%rdi), %xmm0
13567 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13568 ; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13569 ; AVX-NEXT: vmovapd 2432(%rdi), %ymm9
13570 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[3],ymm9[2]
13571 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
13572 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13573 ; AVX-NEXT: vmovapd 2656(%rdi), %ymm8
13574 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13575 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm8[1],ymm0[3],ymm8[2]
13576 ; AVX-NEXT: vmovdqa 2560(%rdi), %xmm7
13577 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
13578 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
13579 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13580 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13581 ; AVX-NEXT: vmovdqa 2784(%rdi), %xmm0
13582 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13583 ; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13584 ; AVX-NEXT: vmovapd 2880(%rdi), %ymm5
13585 ; AVX-NEXT: vshufpd {{.*#+}} ymm1 = ymm2[0],ymm5[1],ymm2[3],ymm5[2]
13586 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
13587 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13588 ; AVX-NEXT: vmovapd 3104(%rdi), %ymm6
13589 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13590 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[3],ymm6[2]
13591 ; AVX-NEXT: vmovdqa 3008(%rdi), %xmm4
13592 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm1 # 16-byte Folded Reload
13593 ; AVX-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
13594 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13595 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13596 ; AVX-NEXT: vmovdqa 3232(%rdi), %xmm0
13597 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13598 ; AVX-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
13599 ; AVX-NEXT: vmovapd 3328(%rdi), %ymm3
13600 ; AVX-NEXT: vshufpd {{.*#+}} ymm2 = ymm14[0],ymm3[1],ymm14[3],ymm3[2]
13601 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3]
13602 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13603 ; AVX-NEXT: vmovapd 3552(%rdi), %ymm2
13604 ; AVX-NEXT: vmovupd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13605 ; AVX-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[3],ymm2[2]
13606 ; AVX-NEXT: vmovdqa 3456(%rdi), %xmm1
13607 ; AVX-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
13608 ; AVX-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
13609 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm14[0,1],ymm0[2,3]
13610 ; AVX-NEXT: vmovupd %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13611 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13612 ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13613 ; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
13614 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13615 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
13616 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
13617 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13618 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13619 ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13620 ; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
13621 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13622 ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
13623 ; AVX-NEXT: # xmm14 = mem[0,1],xmm14[2,3]
13624 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
13625 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13626 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13627 ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13628 ; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
13629 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13630 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
13631 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
13632 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13633 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13634 ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13635 ; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
13636 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13637 ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
13638 ; AVX-NEXT: # xmm14 = mem[0,1],xmm14[2,3]
13639 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
13640 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13641 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13642 ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13643 ; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
13644 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13645 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
13646 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
13647 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13648 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13649 ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13650 ; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
13651 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13652 ; AVX-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
13653 ; AVX-NEXT: # xmm14 = mem[0,1],xmm14[2,3]
13654 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
13655 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13656 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm0 # 32-byte Folded Reload
13657 ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm15[3]
13658 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13659 ; AVX-NEXT: vblendpd {{.*#+}} xmm14 = xmm14[0],mem[1]
13660 ; AVX-NEXT: vblendpd {{.*#+}} ymm15 = ymm14[0,1],ymm0[2,3]
13661 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13662 ; AVX-NEXT: vblendps $63, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
13663 ; AVX-NEXT: # ymm0 = mem[0,1,2,3,4,5],ymm0[6,7]
13664 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
13665 ; AVX-NEXT: # xmm13 = mem[0,1,2,3],xmm13[4,5,6,7]
13666 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm13[0,1,2,3],ymm0[4,5,6,7]
13667 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
13668 ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm11[3]
13669 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
13670 ; AVX-NEXT: vblendpd {{.*#+}} xmm11 = xmm11[0],mem[1]
13671 ; AVX-NEXT: vblendpd {{.*#+}} ymm11 = ymm11[0,1],ymm0[2,3]
13672 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
13673 ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm12[3]
13674 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm10 # 16-byte Folded Reload
13675 ; AVX-NEXT: # xmm10 = mem[0,1,2,3],xmm10[4,5,6,7]
13676 ; AVX-NEXT: vblendpd {{.*#+}} ymm10 = ymm10[0,1],ymm0[2,3]
13677 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
13678 ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm9[3]
13679 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
13680 ; AVX-NEXT: vblendpd {{.*#+}} xmm9 = xmm9[0],mem[1]
13681 ; AVX-NEXT: vblendpd {{.*#+}} ymm9 = ymm9[0,1],ymm0[2,3]
13682 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
13683 ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm8[3]
13684 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
13685 ; AVX-NEXT: # xmm7 = mem[0,1,2,3],xmm7[4,5,6,7]
13686 ; AVX-NEXT: vblendpd {{.*#+}} ymm8 = ymm7[0,1],ymm0[2,3]
13687 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
13688 ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm5[3]
13689 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13690 ; AVX-NEXT: vblendpd {{.*#+}} xmm5 = xmm5[0],mem[1]
13691 ; AVX-NEXT: vblendpd {{.*#+}} ymm5 = ymm5[0,1],ymm0[2,3]
13692 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
13693 ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm6[3]
13694 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
13695 ; AVX-NEXT: # xmm4 = mem[0,1,2,3],xmm4[4,5,6,7]
13696 ; AVX-NEXT: vblendpd {{.*#+}} ymm4 = ymm4[0,1],ymm0[2,3]
13697 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
13698 ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm3[3]
13699 ; AVX-NEXT: vmovapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13700 ; AVX-NEXT: vblendpd {{.*#+}} xmm3 = xmm3[0],mem[1]
13701 ; AVX-NEXT: vblendpd {{.*#+}} ymm3 = ymm3[0,1],ymm0[2,3]
13702 ; AVX-NEXT: vblendpd $7, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
13703 ; AVX-NEXT: # ymm0 = mem[0,1,2],ymm2[3]
13704 ; AVX-NEXT: vpblendw $15, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13705 ; AVX-NEXT: # xmm1 = mem[0,1,2,3],xmm1[4,5,6,7]
13706 ; AVX-NEXT: vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
13707 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13708 ; AVX-NEXT: vmovaps %ymm1, 448(%rsi)
13709 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13710 ; AVX-NEXT: vmovaps %ymm1, 384(%rsi)
13711 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13712 ; AVX-NEXT: vmovaps %ymm1, 320(%rsi)
13713 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13714 ; AVX-NEXT: vmovaps %ymm1, 256(%rsi)
13715 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13716 ; AVX-NEXT: vmovaps %ymm1, 192(%rsi)
13717 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13718 ; AVX-NEXT: vmovaps %ymm1, 128(%rsi)
13719 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13720 ; AVX-NEXT: vmovaps %ymm1, 64(%rsi)
13721 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13722 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
13723 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13724 ; AVX-NEXT: vmovaps %ymm1, 480(%rsi)
13725 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13726 ; AVX-NEXT: vmovaps %ymm1, 416(%rsi)
13727 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13728 ; AVX-NEXT: vmovaps %ymm1, 352(%rsi)
13729 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13730 ; AVX-NEXT: vmovaps %ymm1, 288(%rsi)
13731 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13732 ; AVX-NEXT: vmovaps %ymm1, 224(%rsi)
13733 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13734 ; AVX-NEXT: vmovaps %ymm1, 160(%rsi)
13735 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13736 ; AVX-NEXT: vmovaps %ymm1, 96(%rsi)
13737 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13738 ; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
13739 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13740 ; AVX-NEXT: vmovaps %ymm1, 448(%rdx)
13741 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13742 ; AVX-NEXT: vmovaps %ymm1, 384(%rdx)
13743 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13744 ; AVX-NEXT: vmovaps %ymm1, 320(%rdx)
13745 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13746 ; AVX-NEXT: vmovaps %ymm1, 256(%rdx)
13747 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13748 ; AVX-NEXT: vmovaps %ymm1, 192(%rdx)
13749 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13750 ; AVX-NEXT: vmovaps %ymm1, 128(%rdx)
13751 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13752 ; AVX-NEXT: vmovaps %ymm1, 64(%rdx)
13753 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13754 ; AVX-NEXT: vmovaps %ymm1, (%rdx)
13755 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13756 ; AVX-NEXT: vmovaps %ymm1, 480(%rdx)
13757 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13758 ; AVX-NEXT: vmovaps %ymm1, 416(%rdx)
13759 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13760 ; AVX-NEXT: vmovaps %ymm1, 352(%rdx)
13761 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13762 ; AVX-NEXT: vmovaps %ymm1, 288(%rdx)
13763 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13764 ; AVX-NEXT: vmovaps %ymm1, 224(%rdx)
13765 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13766 ; AVX-NEXT: vmovaps %ymm1, 160(%rdx)
13767 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13768 ; AVX-NEXT: vmovaps %ymm1, 96(%rdx)
13769 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13770 ; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
13771 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13772 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
13773 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13774 ; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
13775 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13776 ; AVX-NEXT: vmovaps %ymm1, 128(%rcx)
13777 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13778 ; AVX-NEXT: vmovaps %ymm1, 192(%rcx)
13779 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13780 ; AVX-NEXT: vmovaps %ymm1, 256(%rcx)
13781 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13782 ; AVX-NEXT: vmovaps %ymm1, 320(%rcx)
13783 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13784 ; AVX-NEXT: vmovaps %ymm1, 384(%rcx)
13785 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13786 ; AVX-NEXT: vmovaps %ymm1, 448(%rcx)
13787 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13788 ; AVX-NEXT: vmovaps %ymm1, 480(%rcx)
13789 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13790 ; AVX-NEXT: vmovaps %ymm1, 416(%rcx)
13791 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13792 ; AVX-NEXT: vmovaps %ymm1, 352(%rcx)
13793 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13794 ; AVX-NEXT: vmovaps %ymm1, 288(%rcx)
13795 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13796 ; AVX-NEXT: vmovaps %ymm1, 224(%rcx)
13797 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13798 ; AVX-NEXT: vmovaps %ymm1, 160(%rcx)
13799 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13800 ; AVX-NEXT: vmovaps %ymm1, 96(%rcx)
13801 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13802 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
13803 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13804 ; AVX-NEXT: vmovaps %ymm1, 480(%r8)
13805 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13806 ; AVX-NEXT: vmovaps %ymm1, 448(%r8)
13807 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13808 ; AVX-NEXT: vmovaps %ymm1, 416(%r8)
13809 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13810 ; AVX-NEXT: vmovaps %ymm1, 384(%r8)
13811 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13812 ; AVX-NEXT: vmovaps %ymm1, 352(%r8)
13813 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13814 ; AVX-NEXT: vmovaps %ymm1, 320(%r8)
13815 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13816 ; AVX-NEXT: vmovaps %ymm1, 288(%r8)
13817 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13818 ; AVX-NEXT: vmovaps %ymm1, 256(%r8)
13819 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13820 ; AVX-NEXT: vmovaps %ymm1, 224(%r8)
13821 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13822 ; AVX-NEXT: vmovaps %ymm1, 192(%r8)
13823 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13824 ; AVX-NEXT: vmovaps %ymm1, 160(%r8)
13825 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13826 ; AVX-NEXT: vmovaps %ymm1, 128(%r8)
13827 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13828 ; AVX-NEXT: vmovaps %ymm1, 96(%r8)
13829 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13830 ; AVX-NEXT: vmovaps %ymm1, 64(%r8)
13831 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13832 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
13833 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13834 ; AVX-NEXT: vmovaps %ymm1, (%r8)
13835 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13836 ; AVX-NEXT: vmovaps %ymm1, 480(%r9)
13837 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13838 ; AVX-NEXT: vmovaps %ymm1, 448(%r9)
13839 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13840 ; AVX-NEXT: vmovaps %ymm1, 416(%r9)
13841 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13842 ; AVX-NEXT: vmovaps %ymm1, 384(%r9)
13843 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13844 ; AVX-NEXT: vmovaps %ymm1, 352(%r9)
13845 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13846 ; AVX-NEXT: vmovaps %ymm1, 320(%r9)
13847 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13848 ; AVX-NEXT: vmovaps %ymm1, 288(%r9)
13849 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13850 ; AVX-NEXT: vmovaps %ymm1, 256(%r9)
13851 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13852 ; AVX-NEXT: vmovaps %ymm1, 224(%r9)
13853 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13854 ; AVX-NEXT: vmovaps %ymm1, 192(%r9)
13855 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13856 ; AVX-NEXT: vmovaps %ymm1, 160(%r9)
13857 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
13858 ; AVX-NEXT: vmovaps %ymm1, 128(%r9)
13859 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13860 ; AVX-NEXT: vmovaps %ymm1, 96(%r9)
13861 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13862 ; AVX-NEXT: vmovaps %ymm1, 64(%r9)
13863 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13864 ; AVX-NEXT: vmovaps %ymm1, 32(%r9)
13865 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13866 ; AVX-NEXT: vmovaps %ymm1, (%r9)
13867 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
13868 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13869 ; AVX-NEXT: vmovaps %ymm1, 480(%rax)
13870 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13871 ; AVX-NEXT: vmovaps %ymm1, 448(%rax)
13872 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13873 ; AVX-NEXT: vmovaps %ymm1, 416(%rax)
13874 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13875 ; AVX-NEXT: vmovaps %ymm1, 384(%rax)
13876 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13877 ; AVX-NEXT: vmovaps %ymm1, 352(%rax)
13878 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13879 ; AVX-NEXT: vmovaps %ymm1, 320(%rax)
13880 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13881 ; AVX-NEXT: vmovaps %ymm1, 288(%rax)
13882 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13883 ; AVX-NEXT: vmovaps %ymm1, 256(%rax)
13884 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13885 ; AVX-NEXT: vmovaps %ymm1, 224(%rax)
13886 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13887 ; AVX-NEXT: vmovaps %ymm1, 192(%rax)
13888 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13889 ; AVX-NEXT: vmovaps %ymm1, 160(%rax)
13890 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13891 ; AVX-NEXT: vmovaps %ymm1, 128(%rax)
13892 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13893 ; AVX-NEXT: vmovaps %ymm1, 96(%rax)
13894 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13895 ; AVX-NEXT: vmovaps %ymm1, 64(%rax)
13896 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13897 ; AVX-NEXT: vmovaps %ymm1, 32(%rax)
13898 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
13899 ; AVX-NEXT: vmovaps %ymm1, (%rax)
13900 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
13901 ; AVX-NEXT: vmovapd %ymm0, 480(%rax)
13902 ; AVX-NEXT: vmovapd %ymm3, 448(%rax)
13903 ; AVX-NEXT: vmovapd %ymm4, 416(%rax)
13904 ; AVX-NEXT: vmovapd %ymm5, 384(%rax)
13905 ; AVX-NEXT: vmovapd %ymm8, 352(%rax)
13906 ; AVX-NEXT: vmovapd %ymm9, 320(%rax)
13907 ; AVX-NEXT: vmovapd %ymm10, 288(%rax)
13908 ; AVX-NEXT: vmovapd %ymm11, 256(%rax)
13909 ; AVX-NEXT: vmovaps %ymm14, 224(%rax)
13910 ; AVX-NEXT: vmovapd %ymm15, 192(%rax)
13911 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13912 ; AVX-NEXT: vmovaps %ymm0, 160(%rax)
13913 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13914 ; AVX-NEXT: vmovaps %ymm0, 128(%rax)
13915 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13916 ; AVX-NEXT: vmovaps %ymm0, 96(%rax)
13917 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13918 ; AVX-NEXT: vmovaps %ymm0, 64(%rax)
13919 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13920 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
13921 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13922 ; AVX-NEXT: vmovaps %ymm0, (%rax)
13923 ; AVX-NEXT: addq $4232, %rsp # imm = 0x1088
13924 ; AVX-NEXT: vzeroupper
13927 ; AVX2-LABEL: load_i64_stride7_vf64:
13929 ; AVX2-NEXT: subq $3928, %rsp # imm = 0xF58
13930 ; AVX2-NEXT: vmovdqa 1216(%rdi), %ymm4
13931 ; AVX2-NEXT: vmovdqa 768(%rdi), %ymm5
13932 ; AVX2-NEXT: vmovdqa 320(%rdi), %ymm6
13933 ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm0
13934 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13935 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13936 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
13937 ; AVX2-NEXT: vmovdqa 224(%rdi), %xmm9
13938 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm1
13939 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13940 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
13941 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13942 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13943 ; AVX2-NEXT: vmovdqa 832(%rdi), %xmm0
13944 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13945 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13946 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
13947 ; AVX2-NEXT: vmovdqa 672(%rdi), %xmm10
13948 ; AVX2-NEXT: vmovdqa 720(%rdi), %xmm1
13949 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13950 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
13951 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13952 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13953 ; AVX2-NEXT: vmovdqa 1280(%rdi), %xmm0
13954 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13955 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13956 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
13957 ; AVX2-NEXT: vmovdqa 1120(%rdi), %xmm11
13958 ; AVX2-NEXT: vmovdqa 1168(%rdi), %xmm1
13959 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13960 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
13961 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13962 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13963 ; AVX2-NEXT: vmovdqa 1664(%rdi), %ymm7
13964 ; AVX2-NEXT: vmovdqa 1728(%rdi), %xmm0
13965 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13966 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13967 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
13968 ; AVX2-NEXT: vmovdqa 1568(%rdi), %xmm12
13969 ; AVX2-NEXT: vmovdqa 1616(%rdi), %xmm1
13970 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13971 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
13972 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13973 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13974 ; AVX2-NEXT: vmovdqa 2112(%rdi), %ymm8
13975 ; AVX2-NEXT: vmovdqa 2176(%rdi), %xmm0
13976 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13977 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13978 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
13979 ; AVX2-NEXT: vmovdqa 2016(%rdi), %xmm2
13980 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13981 ; AVX2-NEXT: vmovdqa 2064(%rdi), %xmm1
13982 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13983 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13984 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13985 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13986 ; AVX2-NEXT: vmovdqa 2560(%rdi), %ymm13
13987 ; AVX2-NEXT: vmovdqa 2624(%rdi), %xmm0
13988 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13989 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
13990 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
13991 ; AVX2-NEXT: vmovdqa 2464(%rdi), %xmm2
13992 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13993 ; AVX2-NEXT: vmovdqa 2512(%rdi), %xmm1
13994 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13995 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13996 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13997 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13998 ; AVX2-NEXT: vmovdqa 3008(%rdi), %ymm14
13999 ; AVX2-NEXT: vmovdqa 3072(%rdi), %xmm0
14000 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14001 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14002 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
14003 ; AVX2-NEXT: vmovdqa 2912(%rdi), %xmm2
14004 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14005 ; AVX2-NEXT: vmovdqa 2960(%rdi), %xmm1
14006 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14007 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14008 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14009 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14010 ; AVX2-NEXT: vmovaps 3456(%rdi), %ymm1
14011 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14012 ; AVX2-NEXT: vmovaps 3520(%rdi), %xmm0
14013 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14014 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
14015 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14016 ; AVX2-NEXT: vmovaps 3360(%rdi), %xmm2
14017 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14018 ; AVX2-NEXT: vmovaps 3408(%rdi), %xmm1
14019 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14020 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14021 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14022 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14023 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm1
14024 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14025 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm0
14026 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14027 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
14028 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14029 ; AVX2-NEXT: vmovaps 48(%rdi), %xmm1
14030 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14031 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
14032 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14033 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14034 ; AVX2-NEXT: vmovdqa 544(%rdi), %ymm1
14035 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14036 ; AVX2-NEXT: vmovdqa 608(%rdi), %xmm0
14037 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14038 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
14039 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14040 ; AVX2-NEXT: vmovdqa 448(%rdi), %xmm0
14041 ; AVX2-NEXT: vmovdqa 496(%rdi), %xmm2
14042 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14043 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3]
14044 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
14045 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14046 ; AVX2-NEXT: vmovdqa 992(%rdi), %ymm1
14047 ; AVX2-NEXT: vmovdqa 1056(%rdi), %xmm2
14048 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14049 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14050 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7]
14051 ; AVX2-NEXT: vmovdqa 896(%rdi), %xmm15
14052 ; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14053 ; AVX2-NEXT: vmovdqa 944(%rdi), %xmm3
14054 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14055 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
14056 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
14057 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14058 ; AVX2-NEXT: vmovdqa 1440(%rdi), %ymm3
14059 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14060 ; AVX2-NEXT: vmovdqa 1504(%rdi), %xmm2
14061 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14062 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14063 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14064 ; AVX2-NEXT: vmovdqa 1344(%rdi), %xmm15
14065 ; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14066 ; AVX2-NEXT: vmovdqa 1392(%rdi), %xmm3
14067 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14068 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
14069 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
14070 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14071 ; AVX2-NEXT: vmovdqa 1888(%rdi), %ymm3
14072 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14073 ; AVX2-NEXT: vmovdqa 1952(%rdi), %xmm2
14074 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14075 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14076 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14077 ; AVX2-NEXT: vmovdqa 1792(%rdi), %xmm15
14078 ; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14079 ; AVX2-NEXT: vmovdqa 1840(%rdi), %xmm3
14080 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14081 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
14082 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
14083 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14084 ; AVX2-NEXT: vmovdqa 2336(%rdi), %ymm3
14085 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14086 ; AVX2-NEXT: vmovdqa 2400(%rdi), %xmm2
14087 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14088 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14089 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14090 ; AVX2-NEXT: vmovdqa 2240(%rdi), %xmm15
14091 ; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14092 ; AVX2-NEXT: vmovdqa 2288(%rdi), %xmm3
14093 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14094 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
14095 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
14096 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14097 ; AVX2-NEXT: vmovdqa 2784(%rdi), %ymm3
14098 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14099 ; AVX2-NEXT: vmovdqa 2848(%rdi), %xmm2
14100 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14101 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14102 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14103 ; AVX2-NEXT: vmovdqa 2688(%rdi), %xmm15
14104 ; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14105 ; AVX2-NEXT: vmovdqa 2736(%rdi), %xmm3
14106 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14107 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
14108 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
14109 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14110 ; AVX2-NEXT: vmovdqa 3232(%rdi), %ymm3
14111 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14112 ; AVX2-NEXT: vmovdqa 3296(%rdi), %xmm2
14113 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14114 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
14115 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14116 ; AVX2-NEXT: vmovdqa 3136(%rdi), %xmm15
14117 ; AVX2-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14118 ; AVX2-NEXT: vmovdqa 3184(%rdi), %xmm3
14119 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14120 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
14121 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
14122 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14123 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm2
14124 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
14125 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm3
14126 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14127 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
14128 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
14129 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14130 ; AVX2-NEXT: vmovdqa 736(%rdi), %xmm2
14131 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
14132 ; AVX2-NEXT: vmovdqa 832(%rdi), %ymm3
14133 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14134 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
14135 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
14136 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14137 ; AVX2-NEXT: vmovdqa 1184(%rdi), %xmm2
14138 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
14139 ; AVX2-NEXT: vmovdqa 1280(%rdi), %ymm3
14140 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14141 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
14142 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
14143 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14144 ; AVX2-NEXT: vmovdqa 1632(%rdi), %xmm2
14145 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
14146 ; AVX2-NEXT: vmovdqa 1728(%rdi), %ymm15
14147 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
14148 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
14149 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14150 ; AVX2-NEXT: vmovdqa 2080(%rdi), %xmm2
14151 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14152 ; AVX2-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
14153 ; AVX2-NEXT: vmovdqa 2176(%rdi), %ymm12
14154 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
14155 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
14156 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14157 ; AVX2-NEXT: vmovdqa 2528(%rdi), %xmm2
14158 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14159 ; AVX2-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
14160 ; AVX2-NEXT: vmovdqa 2624(%rdi), %ymm11
14161 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
14162 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
14163 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14164 ; AVX2-NEXT: vmovdqa 2976(%rdi), %xmm2
14165 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14166 ; AVX2-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
14167 ; AVX2-NEXT: vmovdqa 3072(%rdi), %ymm10
14168 ; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
14169 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
14170 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14171 ; AVX2-NEXT: vmovdqa 3424(%rdi), %xmm2
14172 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14173 ; AVX2-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
14174 ; AVX2-NEXT: vmovdqa 3520(%rdi), %ymm9
14175 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
14176 ; AVX2-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
14177 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
14178 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14179 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm2
14180 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14181 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
14182 ; AVX2-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
14183 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3
14184 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14185 ; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
14186 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
14187 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14188 ; AVX2-NEXT: vmovdqa 608(%rdi), %ymm2
14189 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14190 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
14191 ; AVX2-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
14192 ; AVX2-NEXT: vmovdqa 512(%rdi), %xmm3
14193 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14194 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
14195 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14196 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14197 ; AVX2-NEXT: vmovdqa 1056(%rdi), %ymm0
14198 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14199 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14200 ; AVX2-NEXT: vmovdqa 960(%rdi), %xmm1
14201 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14202 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14203 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
14204 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14205 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14206 ; AVX2-NEXT: vmovdqa 1504(%rdi), %ymm0
14207 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14208 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14209 ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14210 ; AVX2-NEXT: vmovdqa 1408(%rdi), %xmm13
14211 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
14212 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
14213 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14214 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14215 ; AVX2-NEXT: vmovdqa 1952(%rdi), %ymm8
14216 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
14217 ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
14218 ; AVX2-NEXT: vmovdqa 1856(%rdi), %xmm7
14219 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
14220 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
14221 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14222 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14223 ; AVX2-NEXT: vmovdqa 2400(%rdi), %ymm6
14224 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
14225 ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
14226 ; AVX2-NEXT: vmovdqa 2304(%rdi), %xmm5
14227 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
14228 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
14229 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14230 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14231 ; AVX2-NEXT: vmovdqa 2848(%rdi), %ymm4
14232 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
14233 ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
14234 ; AVX2-NEXT: vmovdqa 2752(%rdi), %xmm3
14235 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
14236 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
14237 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14238 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14239 ; AVX2-NEXT: vmovdqa 3296(%rdi), %ymm2
14240 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
14241 ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
14242 ; AVX2-NEXT: vmovdqa 3200(%rdi), %xmm1
14243 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
14244 ; AVX2-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
14245 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
14246 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14247 ; AVX2-NEXT: vbroadcastsd 352(%rdi), %ymm0
14248 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14249 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14250 ; AVX2-NEXT: vmovaps 240(%rdi), %xmm14
14251 ; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
14252 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
14253 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14254 ; AVX2-NEXT: vbroadcastsd 800(%rdi), %ymm0
14255 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14256 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14257 ; AVX2-NEXT: vmovaps 688(%rdi), %xmm14
14258 ; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
14259 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
14260 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14261 ; AVX2-NEXT: vbroadcastsd 1248(%rdi), %ymm0
14262 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14263 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14264 ; AVX2-NEXT: vmovaps 1136(%rdi), %xmm14
14265 ; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
14266 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
14267 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14268 ; AVX2-NEXT: vpbroadcastq 1696(%rdi), %ymm0
14269 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3]
14270 ; AVX2-NEXT: vmovdqa 1584(%rdi), %xmm14
14271 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
14272 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
14273 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14274 ; AVX2-NEXT: vpbroadcastq 2144(%rdi), %ymm0
14275 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
14276 ; AVX2-NEXT: vmovdqa 2032(%rdi), %xmm12
14277 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3]
14278 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
14279 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14280 ; AVX2-NEXT: vpbroadcastq 2592(%rdi), %ymm0
14281 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3]
14282 ; AVX2-NEXT: vmovdqa 2480(%rdi), %xmm11
14283 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
14284 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
14285 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14286 ; AVX2-NEXT: vpbroadcastq 3040(%rdi), %ymm0
14287 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
14288 ; AVX2-NEXT: vmovdqa 2928(%rdi), %xmm10
14289 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3]
14290 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
14291 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14292 ; AVX2-NEXT: vpbroadcastq 3488(%rdi), %ymm0
14293 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
14294 ; AVX2-NEXT: vmovdqa 3376(%rdi), %xmm9
14295 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3]
14296 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
14297 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14298 ; AVX2-NEXT: vpbroadcastq 3264(%rdi), %ymm0
14299 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
14300 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
14301 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14302 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14303 ; AVX2-NEXT: vpbroadcastq 2816(%rdi), %ymm0
14304 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3]
14305 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3]
14306 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14307 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14308 ; AVX2-NEXT: vpbroadcastq 2368(%rdi), %ymm0
14309 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3]
14310 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3]
14311 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14312 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14313 ; AVX2-NEXT: vpbroadcastq 1920(%rdi), %ymm0
14314 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
14315 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3]
14316 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14317 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14318 ; AVX2-NEXT: vpbroadcastq 1472(%rdi), %ymm0
14319 ; AVX2-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14320 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14321 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm13[2,3]
14322 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14323 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14324 ; AVX2-NEXT: vbroadcastsd 1024(%rdi), %ymm0
14325 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14326 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14327 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14328 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
14329 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14330 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14331 ; AVX2-NEXT: vbroadcastsd 576(%rdi), %ymm0
14332 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14333 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14334 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14335 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
14336 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14337 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14338 ; AVX2-NEXT: vbroadcastsd 128(%rdi), %ymm0
14339 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14340 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14341 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14342 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
14343 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14344 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14345 ; AVX2-NEXT: vmovdqa 64(%rdi), %ymm0
14346 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14347 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm13
14348 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
14349 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14350 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14351 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14352 ; AVX2-NEXT: vmovdqa 288(%rdi), %ymm0
14353 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14354 ; AVX2-NEXT: vmovdqa 416(%rdi), %xmm12
14355 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
14356 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14357 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14358 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14359 ; AVX2-NEXT: vmovdqa 512(%rdi), %ymm0
14360 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14361 ; AVX2-NEXT: vmovdqa 640(%rdi), %xmm11
14362 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
14363 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14364 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14365 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14366 ; AVX2-NEXT: vmovdqa 736(%rdi), %ymm0
14367 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14368 ; AVX2-NEXT: vmovdqa 864(%rdi), %xmm10
14369 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
14370 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14371 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14372 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14373 ; AVX2-NEXT: vmovdqa 960(%rdi), %ymm0
14374 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14375 ; AVX2-NEXT: vmovdqa 1088(%rdi), %xmm9
14376 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
14377 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14378 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14379 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14380 ; AVX2-NEXT: vmovdqa 1184(%rdi), %ymm0
14381 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14382 ; AVX2-NEXT: vmovdqa 1312(%rdi), %xmm8
14383 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
14384 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14385 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14386 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14387 ; AVX2-NEXT: vmovdqa 1408(%rdi), %ymm0
14388 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14389 ; AVX2-NEXT: vmovdqa 1536(%rdi), %xmm7
14390 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
14391 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14392 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14393 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14394 ; AVX2-NEXT: vmovdqa 1632(%rdi), %ymm0
14395 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14396 ; AVX2-NEXT: vmovdqa 1760(%rdi), %xmm6
14397 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
14398 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14399 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14400 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14401 ; AVX2-NEXT: vmovdqa 1856(%rdi), %ymm0
14402 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14403 ; AVX2-NEXT: vmovdqa 1984(%rdi), %xmm5
14404 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
14405 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14406 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14407 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14408 ; AVX2-NEXT: vmovdqa 2080(%rdi), %ymm0
14409 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14410 ; AVX2-NEXT: vmovdqa 2208(%rdi), %xmm3
14411 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
14412 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14413 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14414 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14415 ; AVX2-NEXT: vmovdqa 2304(%rdi), %ymm0
14416 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14417 ; AVX2-NEXT: vmovdqa 2432(%rdi), %xmm2
14418 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
14419 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14420 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
14421 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14422 ; AVX2-NEXT: vmovdqa 2528(%rdi), %ymm0
14423 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14424 ; AVX2-NEXT: vmovdqa 2656(%rdi), %xmm14
14425 ; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
14426 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
14427 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
14428 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14429 ; AVX2-NEXT: vmovdqa 2752(%rdi), %ymm1
14430 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
14431 ; AVX2-NEXT: vmovdqa 2880(%rdi), %xmm0
14432 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14433 ; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
14434 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
14435 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
14436 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14437 ; AVX2-NEXT: vmovdqa 2976(%rdi), %ymm1
14438 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
14439 ; AVX2-NEXT: vmovdqa 3104(%rdi), %xmm4
14440 ; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
14441 ; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
14442 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3]
14443 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14444 ; AVX2-NEXT: vmovdqa 3200(%rdi), %ymm1
14445 ; AVX2-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
14446 ; AVX2-NEXT: vmovdqa 3328(%rdi), %xmm0
14447 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14448 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
14449 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
14450 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3]
14451 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14452 ; AVX2-NEXT: vmovdqa 3424(%rdi), %ymm1
14453 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
14454 ; AVX2-NEXT: vmovdqa 3552(%rdi), %xmm0
14455 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14456 ; AVX2-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
14457 ; AVX2-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
14458 ; AVX2-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3]
14459 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14460 ; AVX2-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm1
14461 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm15
14462 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7]
14463 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm13
14464 ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
14465 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
14466 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14467 ; AVX2-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm1
14468 ; AVX2-NEXT: vmovdqa 352(%rdi), %ymm13
14469 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
14470 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm0
14471 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14472 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3]
14473 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
14474 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14475 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1
14476 ; AVX2-NEXT: vmovdqa 576(%rdi), %ymm12
14477 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
14478 ; AVX2-NEXT: vmovdqa 480(%rdi), %xmm0
14479 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14480 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3]
14481 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
14482 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14483 ; AVX2-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm1
14484 ; AVX2-NEXT: vmovdqa 800(%rdi), %ymm11
14485 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
14486 ; AVX2-NEXT: vmovdqa 704(%rdi), %xmm0
14487 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14488 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0,1],mem[2,3]
14489 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
14490 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14491 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1
14492 ; AVX2-NEXT: vmovdqa 1024(%rdi), %ymm10
14493 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7]
14494 ; AVX2-NEXT: vmovdqa 928(%rdi), %xmm0
14495 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14496 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm0[0,1],mem[2,3]
14497 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7]
14498 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14499 ; AVX2-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm1
14500 ; AVX2-NEXT: vmovdqa 1248(%rdi), %ymm9
14501 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
14502 ; AVX2-NEXT: vmovdqa 1152(%rdi), %xmm0
14503 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14504 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3]
14505 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
14506 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14507 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1
14508 ; AVX2-NEXT: vmovdqa 1472(%rdi), %ymm8
14509 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
14510 ; AVX2-NEXT: vmovdqa 1376(%rdi), %xmm0
14511 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14512 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
14513 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
14514 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14515 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm1
14516 ; AVX2-NEXT: vmovdqa 1696(%rdi), %ymm7
14517 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
14518 ; AVX2-NEXT: vmovdqa 1600(%rdi), %xmm0
14519 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14520 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3]
14521 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
14522 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14523 ; AVX2-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm1
14524 ; AVX2-NEXT: vmovdqa 1920(%rdi), %ymm6
14525 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
14526 ; AVX2-NEXT: vmovdqa 1824(%rdi), %xmm0
14527 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14528 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3]
14529 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
14530 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14531 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1
14532 ; AVX2-NEXT: vmovdqa 2144(%rdi), %ymm5
14533 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
14534 ; AVX2-NEXT: vmovdqa 2048(%rdi), %xmm0
14535 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14536 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3]
14537 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
14538 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14539 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1
14540 ; AVX2-NEXT: vmovdqa 2368(%rdi), %ymm3
14541 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
14542 ; AVX2-NEXT: vmovdqa 2272(%rdi), %xmm0
14543 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14544 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],mem[2,3]
14545 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
14546 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14547 ; AVX2-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0
14548 ; AVX2-NEXT: vmovdqa 2592(%rdi), %ymm1
14549 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14550 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14551 ; AVX2-NEXT: vmovdqa 2496(%rdi), %xmm1
14552 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14553 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
14554 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14555 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14556 ; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
14557 ; AVX2-NEXT: vmovdqa 2816(%rdi), %ymm2
14558 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
14559 ; AVX2-NEXT: vmovdqa 2720(%rdi), %xmm1
14560 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14561 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
14562 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14563 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14564 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
14565 ; AVX2-NEXT: vmovdqa 3040(%rdi), %ymm1
14566 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14567 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14568 ; AVX2-NEXT: vmovdqa 2944(%rdi), %xmm1
14569 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14570 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
14571 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14572 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14573 ; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
14574 ; AVX2-NEXT: vmovdqa 3264(%rdi), %ymm1
14575 ; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
14576 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14577 ; AVX2-NEXT: vmovdqa 3168(%rdi), %xmm1
14578 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14579 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
14580 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14581 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14582 ; AVX2-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
14583 ; AVX2-NEXT: vmovdqa 3488(%rdi), %ymm1
14584 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14585 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14586 ; AVX2-NEXT: vmovdqa 3392(%rdi), %xmm1
14587 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14588 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
14589 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14590 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14591 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0
14592 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14593 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14594 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm1
14595 ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
14596 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14597 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14598 ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm0
14599 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14600 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14601 ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm1
14602 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14603 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14604 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
14605 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14606 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14607 ; AVX2-NEXT: vmovdqa 544(%rdi), %xmm0
14608 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14609 ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
14610 ; AVX2-NEXT: vmovdqa 640(%rdi), %ymm1
14611 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14612 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
14613 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14614 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14615 ; AVX2-NEXT: vmovdqa 864(%rdi), %ymm0
14616 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14617 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14618 ; AVX2-NEXT: vmovdqa 768(%rdi), %xmm1
14619 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14620 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14621 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
14622 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14623 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14624 ; AVX2-NEXT: vmovdqa 992(%rdi), %xmm0
14625 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14626 ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
14627 ; AVX2-NEXT: vmovdqa 1088(%rdi), %ymm1
14628 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14629 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
14630 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14631 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14632 ; AVX2-NEXT: vmovdqa 1312(%rdi), %ymm0
14633 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14634 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14635 ; AVX2-NEXT: vmovdqa 1216(%rdi), %xmm1
14636 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14637 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14638 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
14639 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14640 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14641 ; AVX2-NEXT: vmovdqa 1440(%rdi), %xmm0
14642 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14643 ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
14644 ; AVX2-NEXT: vmovdqa 1536(%rdi), %ymm14
14645 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
14646 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14647 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14648 ; AVX2-NEXT: vmovdqa 1760(%rdi), %ymm0
14649 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14650 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
14651 ; AVX2-NEXT: vmovdqa 1664(%rdi), %xmm13
14652 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
14653 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
14654 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14655 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14656 ; AVX2-NEXT: vmovdqa 1888(%rdi), %xmm0
14657 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14658 ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
14659 ; AVX2-NEXT: vmovdqa 1984(%rdi), %ymm10
14660 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
14661 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14662 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14663 ; AVX2-NEXT: vmovdqa 2208(%rdi), %ymm12
14664 ; AVX2-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
14665 ; AVX2-NEXT: vmovdqa 2112(%rdi), %xmm11
14666 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
14667 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
14668 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14669 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14670 ; AVX2-NEXT: vmovdqa 2336(%rdi), %xmm0
14671 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14672 ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
14673 ; AVX2-NEXT: vmovdqa 2432(%rdi), %ymm8
14674 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
14675 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14676 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14677 ; AVX2-NEXT: vmovdqa 2656(%rdi), %ymm9
14678 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
14679 ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
14680 ; AVX2-NEXT: vmovdqa 2560(%rdi), %xmm7
14681 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
14682 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
14683 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14684 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14685 ; AVX2-NEXT: vmovdqa 2784(%rdi), %xmm0
14686 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14687 ; AVX2-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
14688 ; AVX2-NEXT: vmovdqa 2880(%rdi), %ymm4
14689 ; AVX2-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
14690 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14691 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14692 ; AVX2-NEXT: vmovdqa 3104(%rdi), %ymm6
14693 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
14694 ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
14695 ; AVX2-NEXT: vmovdqa 3008(%rdi), %xmm5
14696 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
14697 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
14698 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14699 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14700 ; AVX2-NEXT: vmovdqa 3232(%rdi), %xmm0
14701 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14702 ; AVX2-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
14703 ; AVX2-NEXT: vmovdqa 3328(%rdi), %ymm3
14704 ; AVX2-NEXT: vpalignr $8, (%rsp), %ymm3, %ymm2 # 32-byte Folded Reload
14705 ; AVX2-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
14706 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14707 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
14708 ; AVX2-NEXT: vmovdqa 3552(%rdi), %ymm2
14709 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
14710 ; AVX2-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
14711 ; AVX2-NEXT: vmovdqa 3456(%rdi), %xmm1
14712 ; AVX2-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload
14713 ; AVX2-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
14714 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
14715 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14716 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14717 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14718 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14719 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
14720 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
14721 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
14722 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14723 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14724 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14725 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14726 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
14727 ; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
14728 ; AVX2-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
14729 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
14730 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14731 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14732 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14733 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14734 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
14735 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
14736 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
14737 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14738 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14739 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14740 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14741 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
14742 ; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
14743 ; AVX2-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
14744 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
14745 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14746 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14747 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14748 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14749 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
14750 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
14751 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
14752 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14753 ; AVX2-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14754 ; AVX2-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14755 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14756 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
14757 ; AVX2-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
14758 ; AVX2-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
14759 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
14760 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14761 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14762 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3]
14763 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
14764 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
14765 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
14766 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14767 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14768 ; AVX2-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14769 ; AVX2-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
14770 ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
14771 ; AVX2-NEXT: # xmm13 = mem[0,1],xmm13[2,3]
14772 ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7]
14773 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14774 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
14775 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
14776 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3]
14777 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7]
14778 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14779 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
14780 ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload
14781 ; AVX2-NEXT: # xmm10 = mem[0,1],xmm11[2,3]
14782 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7]
14783 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14784 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
14785 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
14786 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3]
14787 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7]
14788 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14789 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
14790 ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
14791 ; AVX2-NEXT: # xmm7 = mem[0,1],xmm7[2,3]
14792 ; AVX2-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
14793 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14794 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3]
14795 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
14796 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
14797 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm0[4,5,6,7]
14798 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14799 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3]
14800 ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload
14801 ; AVX2-NEXT: # xmm4 = mem[0,1],xmm5[2,3]
14802 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7]
14803 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
14804 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
14805 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
14806 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
14807 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
14808 ; AVX2-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
14809 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
14810 ; AVX2-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14811 ; AVX2-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
14812 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14813 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14814 ; AVX2-NEXT: vmovaps %ymm1, 448(%rsi)
14815 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14816 ; AVX2-NEXT: vmovaps %ymm1, 384(%rsi)
14817 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14818 ; AVX2-NEXT: vmovaps %ymm1, 320(%rsi)
14819 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14820 ; AVX2-NEXT: vmovaps %ymm1, 256(%rsi)
14821 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14822 ; AVX2-NEXT: vmovaps %ymm1, 192(%rsi)
14823 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14824 ; AVX2-NEXT: vmovaps %ymm1, 128(%rsi)
14825 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14826 ; AVX2-NEXT: vmovaps %ymm1, 64(%rsi)
14827 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14828 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
14829 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14830 ; AVX2-NEXT: vmovaps %ymm1, 480(%rsi)
14831 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14832 ; AVX2-NEXT: vmovaps %ymm1, 416(%rsi)
14833 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14834 ; AVX2-NEXT: vmovaps %ymm1, 352(%rsi)
14835 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14836 ; AVX2-NEXT: vmovaps %ymm1, 288(%rsi)
14837 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14838 ; AVX2-NEXT: vmovaps %ymm1, 224(%rsi)
14839 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14840 ; AVX2-NEXT: vmovaps %ymm1, 160(%rsi)
14841 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14842 ; AVX2-NEXT: vmovaps %ymm1, 96(%rsi)
14843 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14844 ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
14845 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14846 ; AVX2-NEXT: vmovaps %ymm1, 448(%rdx)
14847 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14848 ; AVX2-NEXT: vmovaps %ymm1, 384(%rdx)
14849 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14850 ; AVX2-NEXT: vmovaps %ymm1, 320(%rdx)
14851 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14852 ; AVX2-NEXT: vmovaps %ymm1, 256(%rdx)
14853 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14854 ; AVX2-NEXT: vmovaps %ymm1, 192(%rdx)
14855 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14856 ; AVX2-NEXT: vmovaps %ymm1, 128(%rdx)
14857 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14858 ; AVX2-NEXT: vmovaps %ymm1, 64(%rdx)
14859 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14860 ; AVX2-NEXT: vmovaps %ymm1, (%rdx)
14861 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14862 ; AVX2-NEXT: vmovaps %ymm1, 480(%rdx)
14863 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14864 ; AVX2-NEXT: vmovaps %ymm1, 416(%rdx)
14865 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14866 ; AVX2-NEXT: vmovaps %ymm1, 352(%rdx)
14867 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14868 ; AVX2-NEXT: vmovaps %ymm1, 288(%rdx)
14869 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14870 ; AVX2-NEXT: vmovaps %ymm1, 224(%rdx)
14871 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14872 ; AVX2-NEXT: vmovaps %ymm1, 160(%rdx)
14873 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14874 ; AVX2-NEXT: vmovaps %ymm1, 96(%rdx)
14875 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14876 ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
14877 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14878 ; AVX2-NEXT: vmovaps %ymm1, (%rcx)
14879 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14880 ; AVX2-NEXT: vmovaps %ymm1, 64(%rcx)
14881 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14882 ; AVX2-NEXT: vmovaps %ymm1, 128(%rcx)
14883 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14884 ; AVX2-NEXT: vmovaps %ymm1, 192(%rcx)
14885 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14886 ; AVX2-NEXT: vmovaps %ymm1, 256(%rcx)
14887 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14888 ; AVX2-NEXT: vmovaps %ymm1, 320(%rcx)
14889 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14890 ; AVX2-NEXT: vmovaps %ymm1, 384(%rcx)
14891 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14892 ; AVX2-NEXT: vmovaps %ymm1, 448(%rcx)
14893 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14894 ; AVX2-NEXT: vmovaps %ymm1, 480(%rcx)
14895 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14896 ; AVX2-NEXT: vmovaps %ymm1, 416(%rcx)
14897 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14898 ; AVX2-NEXT: vmovaps %ymm1, 352(%rcx)
14899 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14900 ; AVX2-NEXT: vmovaps %ymm1, 288(%rcx)
14901 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14902 ; AVX2-NEXT: vmovaps %ymm1, 224(%rcx)
14903 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14904 ; AVX2-NEXT: vmovaps %ymm1, 160(%rcx)
14905 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14906 ; AVX2-NEXT: vmovaps %ymm1, 96(%rcx)
14907 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14908 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
14909 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14910 ; AVX2-NEXT: vmovaps %ymm1, 480(%r8)
14911 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14912 ; AVX2-NEXT: vmovaps %ymm1, 448(%r8)
14913 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14914 ; AVX2-NEXT: vmovaps %ymm1, 416(%r8)
14915 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14916 ; AVX2-NEXT: vmovaps %ymm1, 384(%r8)
14917 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14918 ; AVX2-NEXT: vmovaps %ymm1, 352(%r8)
14919 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14920 ; AVX2-NEXT: vmovaps %ymm1, 320(%r8)
14921 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14922 ; AVX2-NEXT: vmovaps %ymm1, 288(%r8)
14923 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14924 ; AVX2-NEXT: vmovaps %ymm1, 256(%r8)
14925 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14926 ; AVX2-NEXT: vmovaps %ymm1, 224(%r8)
14927 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14928 ; AVX2-NEXT: vmovaps %ymm1, 192(%r8)
14929 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14930 ; AVX2-NEXT: vmovaps %ymm1, 160(%r8)
14931 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14932 ; AVX2-NEXT: vmovaps %ymm1, 128(%r8)
14933 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14934 ; AVX2-NEXT: vmovaps %ymm1, 96(%r8)
14935 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14936 ; AVX2-NEXT: vmovaps %ymm1, 64(%r8)
14937 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14938 ; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
14939 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14940 ; AVX2-NEXT: vmovaps %ymm1, (%r8)
14941 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14942 ; AVX2-NEXT: vmovaps %ymm1, 480(%r9)
14943 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14944 ; AVX2-NEXT: vmovaps %ymm1, 448(%r9)
14945 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14946 ; AVX2-NEXT: vmovaps %ymm1, 416(%r9)
14947 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14948 ; AVX2-NEXT: vmovaps %ymm1, 384(%r9)
14949 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14950 ; AVX2-NEXT: vmovaps %ymm1, 352(%r9)
14951 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14952 ; AVX2-NEXT: vmovaps %ymm1, 320(%r9)
14953 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14954 ; AVX2-NEXT: vmovaps %ymm1, 288(%r9)
14955 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14956 ; AVX2-NEXT: vmovaps %ymm1, 256(%r9)
14957 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14958 ; AVX2-NEXT: vmovaps %ymm1, 224(%r9)
14959 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14960 ; AVX2-NEXT: vmovaps %ymm1, 192(%r9)
14961 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14962 ; AVX2-NEXT: vmovaps %ymm1, 160(%r9)
14963 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14964 ; AVX2-NEXT: vmovaps %ymm1, 128(%r9)
14965 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14966 ; AVX2-NEXT: vmovaps %ymm1, 96(%r9)
14967 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14968 ; AVX2-NEXT: vmovaps %ymm1, 64(%r9)
14969 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14970 ; AVX2-NEXT: vmovaps %ymm1, 32(%r9)
14971 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14972 ; AVX2-NEXT: vmovaps %ymm1, (%r9)
14973 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
14974 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14975 ; AVX2-NEXT: vmovaps %ymm1, 480(%rax)
14976 ; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
14977 ; AVX2-NEXT: vmovaps %ymm1, 448(%rax)
14978 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14979 ; AVX2-NEXT: vmovaps %ymm1, 416(%rax)
14980 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14981 ; AVX2-NEXT: vmovaps %ymm1, 384(%rax)
14982 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14983 ; AVX2-NEXT: vmovaps %ymm1, 352(%rax)
14984 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14985 ; AVX2-NEXT: vmovaps %ymm1, 320(%rax)
14986 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14987 ; AVX2-NEXT: vmovaps %ymm1, 288(%rax)
14988 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14989 ; AVX2-NEXT: vmovaps %ymm1, 256(%rax)
14990 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14991 ; AVX2-NEXT: vmovaps %ymm1, 224(%rax)
14992 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14993 ; AVX2-NEXT: vmovaps %ymm1, 192(%rax)
14994 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14995 ; AVX2-NEXT: vmovaps %ymm1, 160(%rax)
14996 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14997 ; AVX2-NEXT: vmovaps %ymm1, 128(%rax)
14998 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14999 ; AVX2-NEXT: vmovaps %ymm1, 96(%rax)
15000 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15001 ; AVX2-NEXT: vmovaps %ymm1, 64(%rax)
15002 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15003 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
15004 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15005 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
15006 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
15007 ; AVX2-NEXT: vmovdqa %ymm14, 480(%rax)
15008 ; AVX2-NEXT: vmovdqa %ymm0, 448(%rax)
15009 ; AVX2-NEXT: vmovdqa %ymm4, 416(%rax)
15010 ; AVX2-NEXT: vmovdqa %ymm8, 384(%rax)
15011 ; AVX2-NEXT: vmovdqa %ymm7, 352(%rax)
15012 ; AVX2-NEXT: vmovdqa %ymm10, 320(%rax)
15013 ; AVX2-NEXT: vmovdqa %ymm11, 288(%rax)
15014 ; AVX2-NEXT: vmovdqa %ymm13, 256(%rax)
15015 ; AVX2-NEXT: vmovdqa %ymm15, 224(%rax)
15016 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15017 ; AVX2-NEXT: vmovaps %ymm0, 192(%rax)
15018 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15019 ; AVX2-NEXT: vmovaps %ymm0, 160(%rax)
15020 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15021 ; AVX2-NEXT: vmovaps %ymm0, 128(%rax)
15022 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15023 ; AVX2-NEXT: vmovaps %ymm0, 96(%rax)
15024 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15025 ; AVX2-NEXT: vmovaps %ymm0, 64(%rax)
15026 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15027 ; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
15028 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15029 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
15030 ; AVX2-NEXT: addq $3928, %rsp # imm = 0xF58
15031 ; AVX2-NEXT: vzeroupper
15032 ; AVX2-NEXT: retq
15033 ;
15034 ; AVX2-FP-LABEL: load_i64_stride7_vf64:
15035 ; AVX2-FP: # %bb.0:
15036 ; AVX2-FP-NEXT: subq $3928, %rsp # imm = 0xF58
15037 ; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %ymm4
15038 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %ymm5
15039 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %ymm6
15040 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm0
15041 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15042 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15043 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
15044 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm9
15045 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm1
15046 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15047 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
15048 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15049 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15050 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm0
15051 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15052 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15053 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
15054 ; AVX2-FP-NEXT: vmovdqa 672(%rdi), %xmm10
15055 ; AVX2-FP-NEXT: vmovdqa 720(%rdi), %xmm1
15056 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15057 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
15058 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15059 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15060 ; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %xmm0
15061 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15062 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15063 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
15064 ; AVX2-FP-NEXT: vmovdqa 1120(%rdi), %xmm11
15065 ; AVX2-FP-NEXT: vmovdqa 1168(%rdi), %xmm1
15066 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15067 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
15068 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15069 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15070 ; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %ymm7
15071 ; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %xmm0
15072 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15073 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15074 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
15075 ; AVX2-FP-NEXT: vmovdqa 1568(%rdi), %xmm12
15076 ; AVX2-FP-NEXT: vmovdqa 1616(%rdi), %xmm1
15077 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15078 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
15079 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15080 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15081 ; AVX2-FP-NEXT: vmovdqa 2112(%rdi), %ymm8
15082 ; AVX2-FP-NEXT: vmovdqa 2176(%rdi), %xmm0
15083 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15084 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15085 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
15086 ; AVX2-FP-NEXT: vmovdqa 2016(%rdi), %xmm2
15087 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15088 ; AVX2-FP-NEXT: vmovdqa 2064(%rdi), %xmm1
15089 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15090 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15091 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15092 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15093 ; AVX2-FP-NEXT: vmovdqa 2560(%rdi), %ymm13
15094 ; AVX2-FP-NEXT: vmovdqa 2624(%rdi), %xmm0
15095 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15096 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15097 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
15098 ; AVX2-FP-NEXT: vmovdqa 2464(%rdi), %xmm2
15099 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15100 ; AVX2-FP-NEXT: vmovdqa 2512(%rdi), %xmm1
15101 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15102 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15103 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15104 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15105 ; AVX2-FP-NEXT: vmovdqa 3008(%rdi), %ymm14
15106 ; AVX2-FP-NEXT: vmovdqa 3072(%rdi), %xmm0
15107 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15108 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15109 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
15110 ; AVX2-FP-NEXT: vmovdqa 2912(%rdi), %xmm2
15111 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15112 ; AVX2-FP-NEXT: vmovdqa 2960(%rdi), %xmm1
15113 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15114 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15115 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15116 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15117 ; AVX2-FP-NEXT: vmovaps 3456(%rdi), %ymm1
15118 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15119 ; AVX2-FP-NEXT: vmovaps 3520(%rdi), %xmm0
15120 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15121 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
15122 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15123 ; AVX2-FP-NEXT: vmovaps 3360(%rdi), %xmm2
15124 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15125 ; AVX2-FP-NEXT: vmovaps 3408(%rdi), %xmm1
15126 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15127 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15128 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15129 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15130 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1
15131 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15132 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm0
15133 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15134 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
15135 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15136 ; AVX2-FP-NEXT: vmovaps 48(%rdi), %xmm1
15137 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15138 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
15139 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15140 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15141 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %ymm1
15142 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15143 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm0
15144 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15145 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
15146 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15147 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm0
15148 ; AVX2-FP-NEXT: vmovdqa 496(%rdi), %xmm2
15149 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15150 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3]
15151 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
15152 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15153 ; AVX2-FP-NEXT: vmovdqa 992(%rdi), %ymm1
15154 ; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %xmm2
15155 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15156 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
15157 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7]
15158 ; AVX2-FP-NEXT: vmovdqa 896(%rdi), %xmm15
15159 ; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15160 ; AVX2-FP-NEXT: vmovdqa 944(%rdi), %xmm3
15161 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15162 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
15163 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
15164 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15165 ; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %ymm3
15166 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15167 ; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %xmm2
15168 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15169 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
15170 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15171 ; AVX2-FP-NEXT: vmovdqa 1344(%rdi), %xmm15
15172 ; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15173 ; AVX2-FP-NEXT: vmovdqa 1392(%rdi), %xmm3
15174 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15175 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
15176 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
15177 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15178 ; AVX2-FP-NEXT: vmovdqa 1888(%rdi), %ymm3
15179 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15180 ; AVX2-FP-NEXT: vmovdqa 1952(%rdi), %xmm2
15181 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15182 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
15183 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15184 ; AVX2-FP-NEXT: vmovdqa 1792(%rdi), %xmm15
15185 ; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15186 ; AVX2-FP-NEXT: vmovdqa 1840(%rdi), %xmm3
15187 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15188 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
15189 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
15190 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15191 ; AVX2-FP-NEXT: vmovdqa 2336(%rdi), %ymm3
15192 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15193 ; AVX2-FP-NEXT: vmovdqa 2400(%rdi), %xmm2
15194 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15195 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
15196 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15197 ; AVX2-FP-NEXT: vmovdqa 2240(%rdi), %xmm15
15198 ; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15199 ; AVX2-FP-NEXT: vmovdqa 2288(%rdi), %xmm3
15200 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15201 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
15202 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
15203 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15204 ; AVX2-FP-NEXT: vmovdqa 2784(%rdi), %ymm3
15205 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15206 ; AVX2-FP-NEXT: vmovdqa 2848(%rdi), %xmm2
15207 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15208 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
15209 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15210 ; AVX2-FP-NEXT: vmovdqa 2688(%rdi), %xmm15
15211 ; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15212 ; AVX2-FP-NEXT: vmovdqa 2736(%rdi), %xmm3
15213 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15214 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
15215 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
15216 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15217 ; AVX2-FP-NEXT: vmovdqa 3232(%rdi), %ymm3
15218 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15219 ; AVX2-FP-NEXT: vmovdqa 3296(%rdi), %xmm2
15220 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15221 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
15222 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15223 ; AVX2-FP-NEXT: vmovdqa 3136(%rdi), %xmm15
15224 ; AVX2-FP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15225 ; AVX2-FP-NEXT: vmovdqa 3184(%rdi), %xmm3
15226 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15227 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
15228 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
15229 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15230 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm2
15231 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
15232 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm3
15233 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15234 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
15235 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
15236 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15237 ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %xmm2
15238 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
15239 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %ymm3
15240 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15241 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
15242 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
15243 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15244 ; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %xmm2
15245 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
15246 ; AVX2-FP-NEXT: vmovdqa 1280(%rdi), %ymm3
15247 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15248 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
15249 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
15250 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15251 ; AVX2-FP-NEXT: vmovdqa 1632(%rdi), %xmm2
15252 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
15253 ; AVX2-FP-NEXT: vmovdqa 1728(%rdi), %ymm15
15254 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
15255 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
15256 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15257 ; AVX2-FP-NEXT: vmovdqa 2080(%rdi), %xmm2
15258 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
15259 ; AVX2-FP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
15260 ; AVX2-FP-NEXT: vmovdqa 2176(%rdi), %ymm12
15261 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
15262 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
15263 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15264 ; AVX2-FP-NEXT: vmovdqa 2528(%rdi), %xmm2
15265 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
15266 ; AVX2-FP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
15267 ; AVX2-FP-NEXT: vmovdqa 2624(%rdi), %ymm11
15268 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
15269 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
15270 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15271 ; AVX2-FP-NEXT: vmovdqa 2976(%rdi), %xmm2
15272 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
15273 ; AVX2-FP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
15274 ; AVX2-FP-NEXT: vmovdqa 3072(%rdi), %ymm10
15275 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
15276 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
15277 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15278 ; AVX2-FP-NEXT: vmovdqa 3424(%rdi), %xmm2
15279 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
15280 ; AVX2-FP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
15281 ; AVX2-FP-NEXT: vmovdqa 3520(%rdi), %ymm9
15282 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
15283 ; AVX2-FP-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
15284 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
15285 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15286 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm2
15287 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15288 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
15289 ; AVX2-FP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
15290 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3
15291 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15292 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
15293 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
15294 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15295 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %ymm2
15296 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15297 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
15298 ; AVX2-FP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
15299 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm3
15300 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15301 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
15302 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
15303 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15304 ; AVX2-FP-NEXT: vmovdqa 1056(%rdi), %ymm0
15305 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15306 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15307 ; AVX2-FP-NEXT: vmovdqa 960(%rdi), %xmm1
15308 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15309 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
15310 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
15311 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15312 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15313 ; AVX2-FP-NEXT: vmovdqa 1504(%rdi), %ymm0
15314 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15315 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15316 ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15317 ; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %xmm13
15318 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
15319 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
15320 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15321 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15322 ; AVX2-FP-NEXT: vmovdqa 1952(%rdi), %ymm8
15323 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
15324 ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
15325 ; AVX2-FP-NEXT: vmovdqa 1856(%rdi), %xmm7
15326 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
15327 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
15328 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15329 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15330 ; AVX2-FP-NEXT: vmovdqa 2400(%rdi), %ymm6
15331 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
15332 ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
15333 ; AVX2-FP-NEXT: vmovdqa 2304(%rdi), %xmm5
15334 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
15335 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
15336 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15337 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15338 ; AVX2-FP-NEXT: vmovdqa 2848(%rdi), %ymm4
15339 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
15340 ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
15341 ; AVX2-FP-NEXT: vmovdqa 2752(%rdi), %xmm3
15342 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
15343 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
15344 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15345 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15346 ; AVX2-FP-NEXT: vmovdqa 3296(%rdi), %ymm2
15347 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
15348 ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
15349 ; AVX2-FP-NEXT: vmovdqa 3200(%rdi), %xmm1
15350 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
15351 ; AVX2-FP-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
15352 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
15353 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15354 ; AVX2-FP-NEXT: vbroadcastsd 352(%rdi), %ymm0
15355 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15356 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15357 ; AVX2-FP-NEXT: vmovaps 240(%rdi), %xmm14
15358 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
15359 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
15360 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15361 ; AVX2-FP-NEXT: vbroadcastsd 800(%rdi), %ymm0
15362 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15363 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15364 ; AVX2-FP-NEXT: vmovaps 688(%rdi), %xmm14
15365 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
15366 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
15367 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15368 ; AVX2-FP-NEXT: vbroadcastsd 1248(%rdi), %ymm0
15369 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15370 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15371 ; AVX2-FP-NEXT: vmovaps 1136(%rdi), %xmm14
15372 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
15373 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
15374 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15375 ; AVX2-FP-NEXT: vpbroadcastq 1696(%rdi), %ymm0
15376 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3]
15377 ; AVX2-FP-NEXT: vmovdqa 1584(%rdi), %xmm14
15378 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
15379 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
15380 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15381 ; AVX2-FP-NEXT: vpbroadcastq 2144(%rdi), %ymm0
15382 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
15383 ; AVX2-FP-NEXT: vmovdqa 2032(%rdi), %xmm12
15384 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3]
15385 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
15386 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15387 ; AVX2-FP-NEXT: vpbroadcastq 2592(%rdi), %ymm0
15388 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3]
15389 ; AVX2-FP-NEXT: vmovdqa 2480(%rdi), %xmm11
15390 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
15391 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
15392 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15393 ; AVX2-FP-NEXT: vpbroadcastq 3040(%rdi), %ymm0
15394 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
15395 ; AVX2-FP-NEXT: vmovdqa 2928(%rdi), %xmm10
15396 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3]
15397 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
15398 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15399 ; AVX2-FP-NEXT: vpbroadcastq 3488(%rdi), %ymm0
15400 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
15401 ; AVX2-FP-NEXT: vmovdqa 3376(%rdi), %xmm9
15402 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3]
15403 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
15404 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15405 ; AVX2-FP-NEXT: vpbroadcastq 3264(%rdi), %ymm0
15406 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
15407 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
15408 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15409 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15410 ; AVX2-FP-NEXT: vpbroadcastq 2816(%rdi), %ymm0
15411 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3]
15412 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3]
15413 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15414 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15415 ; AVX2-FP-NEXT: vpbroadcastq 2368(%rdi), %ymm0
15416 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3]
15417 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3]
15418 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15419 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15420 ; AVX2-FP-NEXT: vpbroadcastq 1920(%rdi), %ymm0
15421 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
15422 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3]
15423 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15424 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15425 ; AVX2-FP-NEXT: vpbroadcastq 1472(%rdi), %ymm0
15426 ; AVX2-FP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15427 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15428 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm13[2,3]
15429 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15430 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15431 ; AVX2-FP-NEXT: vbroadcastsd 1024(%rdi), %ymm0
15432 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15433 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15434 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15435 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
15436 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15437 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15438 ; AVX2-FP-NEXT: vbroadcastsd 576(%rdi), %ymm0
15439 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15440 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15441 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15442 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
15443 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15444 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15445 ; AVX2-FP-NEXT: vbroadcastsd 128(%rdi), %ymm0
15446 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15447 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15448 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15449 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
15450 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15451 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15452 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %ymm0
15453 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15454 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm13
15455 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
15456 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15457 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15458 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15459 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %ymm0
15460 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15461 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm12
15462 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
15463 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15464 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15465 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15466 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %ymm0
15467 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15468 ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %xmm11
15469 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
15470 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15471 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15472 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15473 ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm0
15474 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15475 ; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm10
15476 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
15477 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15478 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15479 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15480 ; AVX2-FP-NEXT: vmovdqa 960(%rdi), %ymm0
15481 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15482 ; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %xmm9
15483 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
15484 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15485 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15486 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15487 ; AVX2-FP-NEXT: vmovdqa 1184(%rdi), %ymm0
15488 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15489 ; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %xmm8
15490 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
15491 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15492 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15493 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15494 ; AVX2-FP-NEXT: vmovdqa 1408(%rdi), %ymm0
15495 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15496 ; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %xmm7
15497 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
15498 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15499 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15500 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15501 ; AVX2-FP-NEXT: vmovdqa 1632(%rdi), %ymm0
15502 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15503 ; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %xmm6
15504 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
15505 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15506 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15507 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15508 ; AVX2-FP-NEXT: vmovdqa 1856(%rdi), %ymm0
15509 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15510 ; AVX2-FP-NEXT: vmovdqa 1984(%rdi), %xmm5
15511 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
15512 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15513 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15514 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15515 ; AVX2-FP-NEXT: vmovdqa 2080(%rdi), %ymm0
15516 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15517 ; AVX2-FP-NEXT: vmovdqa 2208(%rdi), %xmm3
15518 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
15519 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15520 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15521 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15522 ; AVX2-FP-NEXT: vmovdqa 2304(%rdi), %ymm0
15523 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15524 ; AVX2-FP-NEXT: vmovdqa 2432(%rdi), %xmm2
15525 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
15526 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15527 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
15528 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15529 ; AVX2-FP-NEXT: vmovdqa 2528(%rdi), %ymm0
15530 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15531 ; AVX2-FP-NEXT: vmovdqa 2656(%rdi), %xmm14
15532 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
15533 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
15534 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
15535 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15536 ; AVX2-FP-NEXT: vmovdqa 2752(%rdi), %ymm1
15537 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
15538 ; AVX2-FP-NEXT: vmovdqa 2880(%rdi), %xmm0
15539 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15540 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
15541 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
15542 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
15543 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15544 ; AVX2-FP-NEXT: vmovdqa 2976(%rdi), %ymm1
15545 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
15546 ; AVX2-FP-NEXT: vmovdqa 3104(%rdi), %xmm4
15547 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
15548 ; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
15549 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3]
15550 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15551 ; AVX2-FP-NEXT: vmovdqa 3200(%rdi), %ymm1
15552 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
15553 ; AVX2-FP-NEXT: vmovdqa 3328(%rdi), %xmm0
15554 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15555 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
15556 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
15557 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3]
15558 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15559 ; AVX2-FP-NEXT: vmovdqa 3424(%rdi), %ymm1
15560 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
15561 ; AVX2-FP-NEXT: vmovdqa 3552(%rdi), %xmm0
15562 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15563 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
15564 ; AVX2-FP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
15565 ; AVX2-FP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3]
15566 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15567 ; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm1
15568 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm15
15569 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7]
15570 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm13
15571 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
15572 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
15573 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15574 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm1
15575 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %ymm13
15576 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
15577 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm0
15578 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15579 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3]
15580 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15581 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15582 ; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1
15583 ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %ymm12
15584 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
15585 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm0
15586 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15587 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3]
15588 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
15589 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15590 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm1
15591 ; AVX2-FP-NEXT: vmovdqa 800(%rdi), %ymm11
15592 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
15593 ; AVX2-FP-NEXT: vmovdqa 704(%rdi), %xmm0
15594 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15595 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0,1],mem[2,3]
15596 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
15597 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15598 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1
15599 ; AVX2-FP-NEXT: vmovdqa 1024(%rdi), %ymm10
15600 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7]
15601 ; AVX2-FP-NEXT: vmovdqa 928(%rdi), %xmm0
15602 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15603 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm0[0,1],mem[2,3]
15604 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7]
15605 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15606 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm1
15607 ; AVX2-FP-NEXT: vmovdqa 1248(%rdi), %ymm9
15608 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
15609 ; AVX2-FP-NEXT: vmovdqa 1152(%rdi), %xmm0
15610 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15611 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3]
15612 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
15613 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15614 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1
15615 ; AVX2-FP-NEXT: vmovdqa 1472(%rdi), %ymm8
15616 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
15617 ; AVX2-FP-NEXT: vmovdqa 1376(%rdi), %xmm0
15618 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15619 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
15620 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
15621 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15622 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm1
15623 ; AVX2-FP-NEXT: vmovdqa 1696(%rdi), %ymm7
15624 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
15625 ; AVX2-FP-NEXT: vmovdqa 1600(%rdi), %xmm0
15626 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15627 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3]
15628 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
15629 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15630 ; AVX2-FP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm1
15631 ; AVX2-FP-NEXT: vmovdqa 1920(%rdi), %ymm6
15632 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
15633 ; AVX2-FP-NEXT: vmovdqa 1824(%rdi), %xmm0
15634 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15635 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3]
15636 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
15637 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15638 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1
15639 ; AVX2-FP-NEXT: vmovdqa 2144(%rdi), %ymm5
15640 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
15641 ; AVX2-FP-NEXT: vmovdqa 2048(%rdi), %xmm0
15642 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15643 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3]
15644 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
15645 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15646 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1
15647 ; AVX2-FP-NEXT: vmovdqa 2368(%rdi), %ymm3
15648 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
15649 ; AVX2-FP-NEXT: vmovdqa 2272(%rdi), %xmm0
15650 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15651 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],mem[2,3]
15652 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
15653 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15654 ; AVX2-FP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0
15655 ; AVX2-FP-NEXT: vmovdqa 2592(%rdi), %ymm1
15656 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15657 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15658 ; AVX2-FP-NEXT: vmovdqa 2496(%rdi), %xmm1
15659 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15660 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
15661 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15662 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15663 ; AVX2-FP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
15664 ; AVX2-FP-NEXT: vmovdqa 2816(%rdi), %ymm2
15665 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
15666 ; AVX2-FP-NEXT: vmovdqa 2720(%rdi), %xmm1
15667 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15668 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
15669 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15670 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15671 ; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
15672 ; AVX2-FP-NEXT: vmovdqa 3040(%rdi), %ymm1
15673 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15674 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15675 ; AVX2-FP-NEXT: vmovdqa 2944(%rdi), %xmm1
15676 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15677 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
15678 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15679 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15680 ; AVX2-FP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
15681 ; AVX2-FP-NEXT: vmovdqa 3264(%rdi), %ymm1
15682 ; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
15683 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15684 ; AVX2-FP-NEXT: vmovdqa 3168(%rdi), %xmm1
15685 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15686 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
15687 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15688 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15689 ; AVX2-FP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
15690 ; AVX2-FP-NEXT: vmovdqa 3488(%rdi), %ymm1
15691 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15692 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15693 ; AVX2-FP-NEXT: vmovdqa 3392(%rdi), %xmm1
15694 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15695 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
15696 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15697 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15698 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0
15699 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15700 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15701 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm1
15702 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
15703 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15704 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15705 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm0
15706 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15707 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15708 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm1
15709 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15710 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
15711 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
15712 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15713 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15714 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm0
15715 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
15716 ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
15717 ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm1
15718 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15719 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
15720 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15721 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15722 ; AVX2-FP-NEXT: vmovdqa 864(%rdi), %ymm0
15723 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15724 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15725 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %xmm1
15726 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15727 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
15728 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
15729 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15730 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15731 ; AVX2-FP-NEXT: vmovdqa 992(%rdi), %xmm0
15732 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
15733 ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
15734 ; AVX2-FP-NEXT: vmovdqa 1088(%rdi), %ymm1
15735 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15736 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
15737 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15738 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15739 ; AVX2-FP-NEXT: vmovdqa 1312(%rdi), %ymm0
15740 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15741 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15742 ; AVX2-FP-NEXT: vmovdqa 1216(%rdi), %xmm1
15743 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15744 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
15745 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
15746 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15747 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15748 ; AVX2-FP-NEXT: vmovdqa 1440(%rdi), %xmm0
15749 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
15750 ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
15751 ; AVX2-FP-NEXT: vmovdqa 1536(%rdi), %ymm14
15752 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
15753 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15754 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15755 ; AVX2-FP-NEXT: vmovdqa 1760(%rdi), %ymm0
15756 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15757 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
15758 ; AVX2-FP-NEXT: vmovdqa 1664(%rdi), %xmm13
15759 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
15760 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
15761 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15762 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15763 ; AVX2-FP-NEXT: vmovdqa 1888(%rdi), %xmm0
15764 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
15765 ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
15766 ; AVX2-FP-NEXT: vmovdqa 1984(%rdi), %ymm10
15767 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
15768 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15769 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15770 ; AVX2-FP-NEXT: vmovdqa 2208(%rdi), %ymm12
15771 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
15772 ; AVX2-FP-NEXT: vmovdqa 2112(%rdi), %xmm11
15773 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
15774 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
15775 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15776 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15777 ; AVX2-FP-NEXT: vmovdqa 2336(%rdi), %xmm0
15778 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
15779 ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
15780 ; AVX2-FP-NEXT: vmovdqa 2432(%rdi), %ymm8
15781 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
15782 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15783 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15784 ; AVX2-FP-NEXT: vmovdqa 2656(%rdi), %ymm9
15785 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
15786 ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
15787 ; AVX2-FP-NEXT: vmovdqa 2560(%rdi), %xmm7
15788 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
15789 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
15790 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15791 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15792 ; AVX2-FP-NEXT: vmovdqa 2784(%rdi), %xmm0
15793 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
15794 ; AVX2-FP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
15795 ; AVX2-FP-NEXT: vmovdqa 2880(%rdi), %ymm4
15796 ; AVX2-FP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
15797 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15798 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15799 ; AVX2-FP-NEXT: vmovdqa 3104(%rdi), %ymm6
15800 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
15801 ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
15802 ; AVX2-FP-NEXT: vmovdqa 3008(%rdi), %xmm5
15803 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
15804 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
15805 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15806 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15807 ; AVX2-FP-NEXT: vmovdqa 3232(%rdi), %xmm0
15808 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
15809 ; AVX2-FP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
15810 ; AVX2-FP-NEXT: vmovdqa 3328(%rdi), %ymm3
15811 ; AVX2-FP-NEXT: vpalignr $8, (%rsp), %ymm3, %ymm2 # 32-byte Folded Reload
15812 ; AVX2-FP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
15813 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15814 ; AVX2-FP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
15815 ; AVX2-FP-NEXT: vmovdqa 3552(%rdi), %ymm2
15816 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
15817 ; AVX2-FP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
15818 ; AVX2-FP-NEXT: vmovdqa 3456(%rdi), %xmm1
15819 ; AVX2-FP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload
15820 ; AVX2-FP-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
15821 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15822 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15823 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15824 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15825 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15826 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15827 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
15828 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15829 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15830 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15831 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15832 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15833 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15834 ; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15835 ; AVX2-FP-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15836 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15837 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15838 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15839 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15840 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15841 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15842 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
15843 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15844 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15845 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15846 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15847 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15848 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15849 ; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15850 ; AVX2-FP-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15851 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15852 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15853 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15854 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15855 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15856 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15857 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
15858 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15859 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15860 ; AVX2-FP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15861 ; AVX2-FP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15862 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15863 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15864 ; AVX2-FP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
15865 ; AVX2-FP-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
15866 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
15867 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15868 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15869 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3]
15870 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
15871 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
15872 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
15873 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15874 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15875 ; AVX2-FP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15876 ; AVX2-FP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
15877 ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
15878 ; AVX2-FP-NEXT: # xmm13 = mem[0,1],xmm13[2,3]
15879 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7]
15880 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15881 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
15882 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
15883 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3]
15884 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7]
15885 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15886 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
15887 ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload
15888 ; AVX2-FP-NEXT: # xmm10 = mem[0,1],xmm11[2,3]
15889 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7]
15890 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15891 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
15892 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
15893 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3]
15894 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7]
15895 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15896 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
15897 ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
15898 ; AVX2-FP-NEXT: # xmm7 = mem[0,1],xmm7[2,3]
15899 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
15900 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15901 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3]
15902 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
15903 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
15904 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm0[4,5,6,7]
15905 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15906 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3]
15907 ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload
15908 ; AVX2-FP-NEXT: # xmm4 = mem[0,1],xmm5[2,3]
15909 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7]
15910 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
15911 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
15912 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
15913 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
15914 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
15915 ; AVX2-FP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
15916 ; AVX2-FP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
15917 ; AVX2-FP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
15918 ; AVX2-FP-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
15919 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15920 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15921 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rsi)
15922 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15923 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rsi)
15924 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15925 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rsi)
15926 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15927 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rsi)
15928 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15929 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rsi)
15930 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15931 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rsi)
15932 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15933 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rsi)
15934 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15935 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
15936 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15937 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rsi)
15938 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15939 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rsi)
15940 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15941 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rsi)
15942 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15943 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rsi)
15944 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15945 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rsi)
15946 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15947 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rsi)
15948 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15949 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rsi)
15950 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15951 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
15952 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15953 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rdx)
15954 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15955 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rdx)
15956 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15957 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rdx)
15958 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15959 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rdx)
15960 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15961 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rdx)
15962 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15963 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rdx)
15964 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15965 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rdx)
15966 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15967 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
15968 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15969 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rdx)
15970 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15971 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rdx)
15972 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15973 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rdx)
15974 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15975 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rdx)
15976 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15977 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rdx)
15978 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15979 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rdx)
15980 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15981 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rdx)
15982 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15983 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
15984 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15985 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
15986 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15987 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rcx)
15988 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15989 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rcx)
15990 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15991 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rcx)
15992 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15993 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rcx)
15994 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15995 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rcx)
15996 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15997 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rcx)
15998 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15999 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rcx)
16000 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16001 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rcx)
16002 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16003 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rcx)
16004 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16005 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rcx)
16006 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16007 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rcx)
16008 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16009 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rcx)
16010 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16011 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rcx)
16012 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16013 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rcx)
16014 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16015 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx)
16016 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16017 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%r8)
16018 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16019 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%r8)
16020 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16021 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%r8)
16022 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16023 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%r8)
16024 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16025 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%r8)
16026 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16027 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%r8)
16028 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16029 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%r8)
16030 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16031 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%r8)
16032 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16033 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r8)
16034 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16035 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r8)
16036 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16037 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r8)
16038 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16039 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r8)
16040 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16041 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r8)
16042 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16043 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r8)
16044 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16045 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8)
16046 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16047 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
16048 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16049 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%r9)
16050 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16051 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%r9)
16052 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16053 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%r9)
16054 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16055 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%r9)
16056 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16057 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%r9)
16058 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16059 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%r9)
16060 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16061 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%r9)
16062 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16063 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%r9)
16064 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16065 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%r9)
16066 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16067 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%r9)
16068 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16069 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%r9)
16070 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16071 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%r9)
16072 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16073 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%r9)
16074 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16075 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%r9)
16076 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16077 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9)
16078 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16079 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r9)
16080 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16081 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16082 ; AVX2-FP-NEXT: vmovaps %ymm1, 480(%rax)
16083 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
16084 ; AVX2-FP-NEXT: vmovaps %ymm1, 448(%rax)
16085 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16086 ; AVX2-FP-NEXT: vmovaps %ymm1, 416(%rax)
16087 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16088 ; AVX2-FP-NEXT: vmovaps %ymm1, 384(%rax)
16089 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16090 ; AVX2-FP-NEXT: vmovaps %ymm1, 352(%rax)
16091 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16092 ; AVX2-FP-NEXT: vmovaps %ymm1, 320(%rax)
16093 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16094 ; AVX2-FP-NEXT: vmovaps %ymm1, 288(%rax)
16095 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16096 ; AVX2-FP-NEXT: vmovaps %ymm1, 256(%rax)
16097 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16098 ; AVX2-FP-NEXT: vmovaps %ymm1, 224(%rax)
16099 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16100 ; AVX2-FP-NEXT: vmovaps %ymm1, 192(%rax)
16101 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16102 ; AVX2-FP-NEXT: vmovaps %ymm1, 160(%rax)
16103 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16104 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax)
16105 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16106 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax)
16107 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16108 ; AVX2-FP-NEXT: vmovaps %ymm1, 64(%rax)
16109 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16110 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
16111 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
16112 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
16113 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16114 ; AVX2-FP-NEXT: vmovdqa %ymm14, 480(%rax)
16115 ; AVX2-FP-NEXT: vmovdqa %ymm0, 448(%rax)
16116 ; AVX2-FP-NEXT: vmovdqa %ymm4, 416(%rax)
16117 ; AVX2-FP-NEXT: vmovdqa %ymm8, 384(%rax)
16118 ; AVX2-FP-NEXT: vmovdqa %ymm7, 352(%rax)
16119 ; AVX2-FP-NEXT: vmovdqa %ymm10, 320(%rax)
16120 ; AVX2-FP-NEXT: vmovdqa %ymm11, 288(%rax)
16121 ; AVX2-FP-NEXT: vmovdqa %ymm13, 256(%rax)
16122 ; AVX2-FP-NEXT: vmovdqa %ymm15, 224(%rax)
16123 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16124 ; AVX2-FP-NEXT: vmovaps %ymm0, 192(%rax)
16125 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16126 ; AVX2-FP-NEXT: vmovaps %ymm0, 160(%rax)
16127 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16128 ; AVX2-FP-NEXT: vmovaps %ymm0, 128(%rax)
16129 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16130 ; AVX2-FP-NEXT: vmovaps %ymm0, 96(%rax)
16131 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16132 ; AVX2-FP-NEXT: vmovaps %ymm0, 64(%rax)
16133 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16134 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
16135 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
16136 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
16137 ; AVX2-FP-NEXT: addq $3928, %rsp # imm = 0xF58
16138 ; AVX2-FP-NEXT: vzeroupper
16139 ; AVX2-FP-NEXT: retq
16141 ; AVX2-FCP-LABEL: load_i64_stride7_vf64:
16142 ; AVX2-FCP: # %bb.0:
16143 ; AVX2-FCP-NEXT: subq $3928, %rsp # imm = 0xF58
16144 ; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %ymm4
16145 ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %ymm5
16146 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm6
16147 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm0
16148 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16149 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16150 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
16151 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %xmm9
16152 ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
16153 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16154 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
16155 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16156 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16157 ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %xmm0
16158 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16159 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16160 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
16161 ; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %xmm10
16162 ; AVX2-FCP-NEXT: vmovdqa 720(%rdi), %xmm1
16163 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16164 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
16165 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16166 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16167 ; AVX2-FCP-NEXT: vmovdqa 1280(%rdi), %xmm0
16168 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16169 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16170 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
16171 ; AVX2-FCP-NEXT: vmovdqa 1120(%rdi), %xmm11
16172 ; AVX2-FCP-NEXT: vmovdqa 1168(%rdi), %xmm1
16173 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16174 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
16175 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16176 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16177 ; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %ymm7
16178 ; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %xmm0
16179 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16180 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16181 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
16182 ; AVX2-FCP-NEXT: vmovdqa 1568(%rdi), %xmm12
16183 ; AVX2-FCP-NEXT: vmovdqa 1616(%rdi), %xmm1
16184 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16185 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm12[0,1],xmm1[2,3]
16186 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16187 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16188 ; AVX2-FCP-NEXT: vmovdqa 2112(%rdi), %ymm8
16189 ; AVX2-FCP-NEXT: vmovdqa 2176(%rdi), %xmm0
16190 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16191 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16192 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm0[6,7]
16193 ; AVX2-FCP-NEXT: vmovdqa 2016(%rdi), %xmm2
16194 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16195 ; AVX2-FCP-NEXT: vmovdqa 2064(%rdi), %xmm1
16196 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16197 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
16198 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16199 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16200 ; AVX2-FCP-NEXT: vmovdqa 2560(%rdi), %ymm13
16201 ; AVX2-FCP-NEXT: vmovdqa 2624(%rdi), %xmm0
16202 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16203 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16204 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4,5],ymm0[6,7]
16205 ; AVX2-FCP-NEXT: vmovdqa 2464(%rdi), %xmm2
16206 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16207 ; AVX2-FCP-NEXT: vmovdqa 2512(%rdi), %xmm1
16208 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16209 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
16210 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16211 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16212 ; AVX2-FCP-NEXT: vmovdqa 3008(%rdi), %ymm14
16213 ; AVX2-FCP-NEXT: vmovdqa 3072(%rdi), %xmm0
16214 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16215 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16216 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5],ymm0[6,7]
16217 ; AVX2-FCP-NEXT: vmovdqa 2912(%rdi), %xmm2
16218 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16219 ; AVX2-FCP-NEXT: vmovdqa 2960(%rdi), %xmm1
16220 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16221 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
16222 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16223 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16224 ; AVX2-FCP-NEXT: vmovaps 3456(%rdi), %ymm1
16225 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16226 ; AVX2-FCP-NEXT: vmovaps 3520(%rdi), %xmm0
16227 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16228 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
16229 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16230 ; AVX2-FCP-NEXT: vmovaps 3360(%rdi), %xmm2
16231 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16232 ; AVX2-FCP-NEXT: vmovaps 3408(%rdi), %xmm1
16233 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16234 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
16235 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16236 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16237 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1
16238 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16239 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm0
16240 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16241 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
16242 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16243 ; AVX2-FCP-NEXT: vmovaps 48(%rdi), %xmm1
16244 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16245 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
16246 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16247 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16248 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %ymm1
16249 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16250 ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm0
16251 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16252 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
16253 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16254 ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %xmm0
16255 ; AVX2-FCP-NEXT: vmovdqa 496(%rdi), %xmm2
16256 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16257 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3]
16258 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
16259 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16260 ; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %ymm1
16261 ; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %xmm2
16262 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16263 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
16264 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm2[6,7]
16265 ; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %xmm15
16266 ; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16267 ; AVX2-FCP-NEXT: vmovdqa 944(%rdi), %xmm3
16268 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16269 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
16270 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
16271 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16272 ; AVX2-FCP-NEXT: vmovdqa 1440(%rdi), %ymm3
16273 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16274 ; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %xmm2
16275 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16276 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
16277 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
16278 ; AVX2-FCP-NEXT: vmovdqa 1344(%rdi), %xmm15
16279 ; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16280 ; AVX2-FCP-NEXT: vmovdqa 1392(%rdi), %xmm3
16281 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16282 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
16283 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
16284 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16285 ; AVX2-FCP-NEXT: vmovdqa 1888(%rdi), %ymm3
16286 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16287 ; AVX2-FCP-NEXT: vmovdqa 1952(%rdi), %xmm2
16288 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16289 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
16290 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
16291 ; AVX2-FCP-NEXT: vmovdqa 1792(%rdi), %xmm15
16292 ; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16293 ; AVX2-FCP-NEXT: vmovdqa 1840(%rdi), %xmm3
16294 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16295 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
16296 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
16297 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16298 ; AVX2-FCP-NEXT: vmovdqa 2336(%rdi), %ymm3
16299 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16300 ; AVX2-FCP-NEXT: vmovdqa 2400(%rdi), %xmm2
16301 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16302 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
16303 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
16304 ; AVX2-FCP-NEXT: vmovdqa 2240(%rdi), %xmm15
16305 ; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16306 ; AVX2-FCP-NEXT: vmovdqa 2288(%rdi), %xmm3
16307 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16308 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
16309 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
16310 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16311 ; AVX2-FCP-NEXT: vmovdqa 2784(%rdi), %ymm3
16312 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16313 ; AVX2-FCP-NEXT: vmovdqa 2848(%rdi), %xmm2
16314 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16315 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
16316 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
16317 ; AVX2-FCP-NEXT: vmovdqa 2688(%rdi), %xmm15
16318 ; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16319 ; AVX2-FCP-NEXT: vmovdqa 2736(%rdi), %xmm3
16320 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16321 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
16322 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
16323 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16324 ; AVX2-FCP-NEXT: vmovdqa 3232(%rdi), %ymm3
16325 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16326 ; AVX2-FCP-NEXT: vmovdqa 3296(%rdi), %xmm2
16327 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16328 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
16329 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
16330 ; AVX2-FCP-NEXT: vmovdqa 3136(%rdi), %xmm15
16331 ; AVX2-FCP-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16332 ; AVX2-FCP-NEXT: vmovdqa 3184(%rdi), %xmm3
16333 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16334 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm15[0,1],xmm3[2,3]
16335 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
16336 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16337 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
16338 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm9[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
16339 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
16340 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16341 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm6[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
16342 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
16343 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16344 ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %xmm2
16345 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm10[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
16346 ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %ymm3
16347 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16348 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm5[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
16349 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
16350 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16351 ; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %xmm2
16352 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm11[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
16353 ; AVX2-FCP-NEXT: vmovdqa 1280(%rdi), %ymm3
16354 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16355 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm4[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],ymm4[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
16356 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
16357 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16358 ; AVX2-FCP-NEXT: vmovdqa 1632(%rdi), %xmm2
16359 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm12[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
16360 ; AVX2-FCP-NEXT: vmovdqa 1728(%rdi), %ymm15
16361 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm7[8,9,10,11,12,13,14,15],ymm15[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm15[16,17,18,19,20,21,22,23]
16362 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
16363 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16364 ; AVX2-FCP-NEXT: vmovdqa 2080(%rdi), %xmm2
16365 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
16366 ; AVX2-FCP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
16367 ; AVX2-FCP-NEXT: vmovdqa 2176(%rdi), %ymm12
16368 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm8[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
16369 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
16370 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16371 ; AVX2-FCP-NEXT: vmovdqa 2528(%rdi), %xmm2
16372 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
16373 ; AVX2-FCP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
16374 ; AVX2-FCP-NEXT: vmovdqa 2624(%rdi), %ymm11
16375 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm13[8,9,10,11,12,13,14,15],ymm11[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm11[16,17,18,19,20,21,22,23]
16376 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
16377 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16378 ; AVX2-FCP-NEXT: vmovdqa 2976(%rdi), %xmm2
16379 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
16380 ; AVX2-FCP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
16381 ; AVX2-FCP-NEXT: vmovdqa 3072(%rdi), %ymm10
16382 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm3 = ymm14[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm14[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
16383 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
16384 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16385 ; AVX2-FCP-NEXT: vmovdqa 3424(%rdi), %xmm2
16386 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
16387 ; AVX2-FCP-NEXT: # xmm2 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
16388 ; AVX2-FCP-NEXT: vmovdqa 3520(%rdi), %ymm9
16389 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm3 # 32-byte Folded Reload
16390 ; AVX2-FCP-NEXT: # ymm3 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
16391 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
16392 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16393 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm2
16394 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16395 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
16396 ; AVX2-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
16397 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
16398 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16399 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm3 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
16400 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
16401 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16402 ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %ymm2
16403 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16404 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
16405 ; AVX2-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
16406 ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm3
16407 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16408 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
16409 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
16410 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16411 ; AVX2-FCP-NEXT: vmovdqa 1056(%rdi), %ymm0
16412 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16413 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm1[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm1[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16414 ; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %xmm1
16415 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16416 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
16417 ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
16418 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16419 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16420 ; AVX2-FCP-NEXT: vmovdqa 1504(%rdi), %ymm0
16421 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16422 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16423 ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16424 ; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %xmm13
16425 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
16426 ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
16427 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16428 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16429 ; AVX2-FCP-NEXT: vmovdqa 1952(%rdi), %ymm8
16430 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
16431 ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
16432 ; AVX2-FCP-NEXT: vmovdqa 1856(%rdi), %xmm7
16433 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
16434 ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
16435 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16436 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16437 ; AVX2-FCP-NEXT: vmovdqa 2400(%rdi), %ymm6
16438 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
16439 ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
16440 ; AVX2-FCP-NEXT: vmovdqa 2304(%rdi), %xmm5
16441 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
16442 ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
16443 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16444 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16445 ; AVX2-FCP-NEXT: vmovdqa 2848(%rdi), %ymm4
16446 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
16447 ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
16448 ; AVX2-FCP-NEXT: vmovdqa 2752(%rdi), %xmm3
16449 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm1 # 16-byte Folded Reload
16450 ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
16451 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16452 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16453 ; AVX2-FCP-NEXT: vmovdqa 3296(%rdi), %ymm2
16454 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
16455 ; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
16456 ; AVX2-FCP-NEXT: vmovdqa 3200(%rdi), %xmm1
16457 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
16458 ; AVX2-FCP-NEXT: # xmm14 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
16459 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
16460 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16461 ; AVX2-FCP-NEXT: vbroadcastsd 352(%rdi), %ymm0
16462 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16463 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16464 ; AVX2-FCP-NEXT: vmovaps 240(%rdi), %xmm14
16465 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
16466 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
16467 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16468 ; AVX2-FCP-NEXT: vbroadcastsd 800(%rdi), %ymm0
16469 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16470 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16471 ; AVX2-FCP-NEXT: vmovaps 688(%rdi), %xmm14
16472 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
16473 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
16474 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16475 ; AVX2-FCP-NEXT: vbroadcastsd 1248(%rdi), %ymm0
16476 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16477 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16478 ; AVX2-FCP-NEXT: vmovaps 1136(%rdi), %xmm14
16479 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
16480 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
16481 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16482 ; AVX2-FCP-NEXT: vpbroadcastq 1696(%rdi), %ymm0
16483 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm15[1],ymm0[3],ymm15[3]
16484 ; AVX2-FCP-NEXT: vmovdqa 1584(%rdi), %xmm14
16485 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
16486 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
16487 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16488 ; AVX2-FCP-NEXT: vpbroadcastq 2144(%rdi), %ymm0
16489 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
16490 ; AVX2-FCP-NEXT: vmovdqa 2032(%rdi), %xmm12
16491 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],mem[2,3]
16492 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
16493 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16494 ; AVX2-FCP-NEXT: vpbroadcastq 2592(%rdi), %ymm0
16495 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3]
16496 ; AVX2-FCP-NEXT: vmovdqa 2480(%rdi), %xmm11
16497 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],mem[2,3]
16498 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
16499 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16500 ; AVX2-FCP-NEXT: vpbroadcastq 3040(%rdi), %ymm0
16501 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
16502 ; AVX2-FCP-NEXT: vmovdqa 2928(%rdi), %xmm10
16503 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3]
16504 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
16505 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16506 ; AVX2-FCP-NEXT: vpbroadcastq 3488(%rdi), %ymm0
16507 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
16508 ; AVX2-FCP-NEXT: vmovdqa 3376(%rdi), %xmm9
16509 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],mem[2,3]
16510 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
16511 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16512 ; AVX2-FCP-NEXT: vpbroadcastq 3264(%rdi), %ymm0
16513 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm2[1],ymm0[3],ymm2[3]
16514 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
16515 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16516 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16517 ; AVX2-FCP-NEXT: vpbroadcastq 2816(%rdi), %ymm0
16518 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3]
16519 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm3[2,3]
16520 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16521 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16522 ; AVX2-FCP-NEXT: vpbroadcastq 2368(%rdi), %ymm0
16523 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3]
16524 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm5[2,3]
16525 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16526 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16527 ; AVX2-FCP-NEXT: vpbroadcastq 1920(%rdi), %ymm0
16528 ; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
16529 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm7[2,3]
16530 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16531 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16532 ; AVX2-FCP-NEXT: vpbroadcastq 1472(%rdi), %ymm0
16533 ; AVX2-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16534 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16535 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = mem[0,1],xmm13[2,3]
16536 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16537 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16538 ; AVX2-FCP-NEXT: vbroadcastsd 1024(%rdi), %ymm0
16539 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16540 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16541 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
16542 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
16543 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16544 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16545 ; AVX2-FCP-NEXT: vbroadcastsd 576(%rdi), %ymm0
16546 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16547 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16548 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
16549 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
16550 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16551 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16552 ; AVX2-FCP-NEXT: vbroadcastsd 128(%rdi), %ymm0
16553 ; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
16554 ; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
16555 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
16556 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
16557 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16558 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16559 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm0
16560 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16561 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %xmm13
16562 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
16563 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16564 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16565 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16566 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %ymm0
16567 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16568 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm12
16569 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm12[0,1,2,3,4,5,6,7]
16570 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16571 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16572 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16573 ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %ymm0
16574 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16575 ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %xmm11
16576 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
16577 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16578 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16579 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16580 ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm0
16581 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16582 ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm10
16583 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm10[0,1,2,3,4,5,6,7]
16584 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16585 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16586 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16587 ; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %ymm0
16588 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16589 ; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %xmm9
16590 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm9[0,1,2,3,4,5,6,7]
16591 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16592 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16593 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16594 ; AVX2-FCP-NEXT: vmovdqa 1184(%rdi), %ymm0
16595 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16596 ; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %xmm8
16597 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm8[0,1,2,3,4,5,6,7]
16598 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16599 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16600 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16601 ; AVX2-FCP-NEXT: vmovdqa 1408(%rdi), %ymm0
16602 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16603 ; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %xmm7
16604 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
16605 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16606 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16607 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16608 ; AVX2-FCP-NEXT: vmovdqa 1632(%rdi), %ymm0
16609 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16610 ; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %xmm6
16611 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
16612 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16613 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16614 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16615 ; AVX2-FCP-NEXT: vmovdqa 1856(%rdi), %ymm0
16616 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16617 ; AVX2-FCP-NEXT: vmovdqa 1984(%rdi), %xmm5
16618 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
16619 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16620 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16621 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16622 ; AVX2-FCP-NEXT: vmovdqa 2080(%rdi), %ymm0
16623 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16624 ; AVX2-FCP-NEXT: vmovdqa 2208(%rdi), %xmm3
16625 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7]
16626 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16627 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16628 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16629 ; AVX2-FCP-NEXT: vmovdqa 2304(%rdi), %ymm0
16630 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16631 ; AVX2-FCP-NEXT: vmovdqa 2432(%rdi), %xmm2
16632 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
16633 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16634 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3]
16635 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16636 ; AVX2-FCP-NEXT: vmovdqa 2528(%rdi), %ymm0
16637 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16638 ; AVX2-FCP-NEXT: vmovdqa 2656(%rdi), %xmm14
16639 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm14[0,1,2,3,4,5,6,7]
16640 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
16641 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
16642 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16643 ; AVX2-FCP-NEXT: vmovdqa 2752(%rdi), %ymm1
16644 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
16645 ; AVX2-FCP-NEXT: vmovdqa 2880(%rdi), %xmm0
16646 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16647 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm4 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
16648 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
16649 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm4[2,3]
16650 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16651 ; AVX2-FCP-NEXT: vmovdqa 2976(%rdi), %ymm1
16652 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
16653 ; AVX2-FCP-NEXT: vmovdqa 3104(%rdi), %xmm4
16654 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4,5,6,7]
16655 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
16656 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3]
16657 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16658 ; AVX2-FCP-NEXT: vmovdqa 3200(%rdi), %ymm1
16659 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm15 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
16660 ; AVX2-FCP-NEXT: vmovdqa 3328(%rdi), %xmm0
16661 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16662 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
16663 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
16664 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm15[2,3],ymm1[2,3]
16665 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16666 ; AVX2-FCP-NEXT: vmovdqa 3424(%rdi), %ymm1
16667 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
16668 ; AVX2-FCP-NEXT: vmovdqa 3552(%rdi), %xmm0
16669 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16670 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm15 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
16671 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15
16672 ; AVX2-FCP-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm15[2,3]
16673 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16674 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm1
16675 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm15
16676 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7]
16677 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm13
16678 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],mem[2,3]
16679 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3],ymm1[4,5,6,7]
16680 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16681 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm1
16682 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm13
16683 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1,2,3,4,5],ymm1[6,7]
16684 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm0
16685 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16686 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm0[0,1],mem[2,3]
16687 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16688 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16689 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm1
16690 ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %ymm12
16691 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
16692 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %xmm0
16693 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16694 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm0[0,1],mem[2,3]
16695 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
16696 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16697 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm1
16698 ; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %ymm11
16699 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
16700 ; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %xmm0
16701 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16702 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm0[0,1],mem[2,3]
16703 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
16704 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16705 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm1
16706 ; AVX2-FCP-NEXT: vmovdqa 1024(%rdi), %ymm10
16707 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7]
16708 ; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %xmm0
16709 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16710 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm0[0,1],mem[2,3]
16711 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm1[4,5,6,7]
16712 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16713 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm1
16714 ; AVX2-FCP-NEXT: vmovdqa 1248(%rdi), %ymm9
16715 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm1[6,7]
16716 ; AVX2-FCP-NEXT: vmovdqa 1152(%rdi), %xmm0
16717 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16718 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm0[0,1],mem[2,3]
16719 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3],ymm1[4,5,6,7]
16720 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16721 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm1
16722 ; AVX2-FCP-NEXT: vmovdqa 1472(%rdi), %ymm8
16723 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
16724 ; AVX2-FCP-NEXT: vmovdqa 1376(%rdi), %xmm0
16725 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16726 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm0[0,1],mem[2,3]
16727 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
16728 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16729 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm1
16730 ; AVX2-FCP-NEXT: vmovdqa 1696(%rdi), %ymm7
16731 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
16732 ; AVX2-FCP-NEXT: vmovdqa 1600(%rdi), %xmm0
16733 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16734 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm0[0,1],mem[2,3]
16735 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
16736 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16737 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm1
16738 ; AVX2-FCP-NEXT: vmovdqa 1920(%rdi), %ymm6
16739 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
16740 ; AVX2-FCP-NEXT: vmovdqa 1824(%rdi), %xmm0
16741 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16742 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],mem[2,3]
16743 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
16744 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16745 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm1
16746 ; AVX2-FCP-NEXT: vmovdqa 2144(%rdi), %ymm5
16747 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
16748 ; AVX2-FCP-NEXT: vmovdqa 2048(%rdi), %xmm0
16749 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16750 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm0[0,1],mem[2,3]
16751 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
16752 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16753 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm1
16754 ; AVX2-FCP-NEXT: vmovdqa 2368(%rdi), %ymm3
16755 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
16756 ; AVX2-FCP-NEXT: vmovdqa 2272(%rdi), %xmm0
16757 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16758 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],mem[2,3]
16759 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
16760 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16761 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm0
16762 ; AVX2-FCP-NEXT: vmovdqa 2592(%rdi), %ymm1
16763 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16764 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16765 ; AVX2-FCP-NEXT: vmovdqa 2496(%rdi), %xmm1
16766 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16767 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
16768 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16769 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16770 ; AVX2-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
16771 ; AVX2-FCP-NEXT: vmovdqa 2816(%rdi), %ymm2
16772 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
16773 ; AVX2-FCP-NEXT: vmovdqa 2720(%rdi), %xmm1
16774 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16775 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
16776 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16777 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16778 ; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0
16779 ; AVX2-FCP-NEXT: vmovdqa 3040(%rdi), %ymm1
16780 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16781 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16782 ; AVX2-FCP-NEXT: vmovdqa 2944(%rdi), %xmm1
16783 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16784 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
16785 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16786 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16787 ; AVX2-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
16788 ; AVX2-FCP-NEXT: vmovdqa 3264(%rdi), %ymm1
16789 ; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
16790 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16791 ; AVX2-FCP-NEXT: vmovdqa 3168(%rdi), %xmm1
16792 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16793 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
16794 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16795 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16796 ; AVX2-FCP-NEXT: vinserti128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 16-byte Folded Reload
16797 ; AVX2-FCP-NEXT: vmovdqa 3488(%rdi), %ymm1
16798 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16799 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16800 ; AVX2-FCP-NEXT: vmovdqa 3392(%rdi), %xmm1
16801 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16802 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
16803 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16804 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16805 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0
16806 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16807 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm15[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm15[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16808 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
16809 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
16810 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16811 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16812 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm0
16813 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16814 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm13[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm13[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16815 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
16816 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16817 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
16818 ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
16819 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16820 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16821 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm0
16822 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
16823 ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
16824 ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm1
16825 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16826 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm12[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm12[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
16827 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16828 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16829 ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %ymm0
16830 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16831 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm11[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm11[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16832 ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm1
16833 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16834 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
16835 ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
16836 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16837 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16838 ; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %xmm0
16839 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
16840 ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
16841 ; AVX2-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1
16842 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16843 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm10[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],ymm10[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
16844 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16845 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16846 ; AVX2-FCP-NEXT: vmovdqa 1312(%rdi), %ymm0
16847 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16848 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm9[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm9[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16849 ; AVX2-FCP-NEXT: vmovdqa 1216(%rdi), %xmm1
16850 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16851 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
16852 ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
16853 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16854 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16855 ; AVX2-FCP-NEXT: vmovdqa 1440(%rdi), %xmm0
16856 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
16857 ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
16858 ; AVX2-FCP-NEXT: vmovdqa 1536(%rdi), %ymm14
16859 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm8[8,9,10,11,12,13,14,15],ymm14[0,1,2,3,4,5,6,7],ymm8[24,25,26,27,28,29,30,31],ymm14[16,17,18,19,20,21,22,23]
16860 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16861 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16862 ; AVX2-FCP-NEXT: vmovdqa 1760(%rdi), %ymm0
16863 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16864 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm7[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],ymm7[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
16865 ; AVX2-FCP-NEXT: vmovdqa 1664(%rdi), %xmm13
16866 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
16867 ; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm13[0,1,2,3,4,5,6,7]
16868 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
16869 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16870 ; AVX2-FCP-NEXT: vmovdqa 1888(%rdi), %xmm0
16871 ; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
16872 ; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
16873 ; AVX2-FCP-NEXT: vmovdqa 1984(%rdi), %ymm10
16874 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm6[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],ymm6[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
16875 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16876 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16877 ; AVX2-FCP-NEXT: vmovdqa 2208(%rdi), %ymm12
16878 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm0 = ymm5[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],ymm5[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
; AVX2-FCP-NEXT: vmovdqa 2112(%rdi), %xmm11
; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm11[0,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 2336(%rdi), %xmm0
; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa 2432(%rdi), %ymm8
; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm3[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],ymm3[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 2656(%rdi), %ymm9
; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
; AVX2-FCP-NEXT: vmovdqa 2560(%rdi), %xmm7
; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm7[0,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 2784(%rdi), %xmm0
; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4
; AVX2-FCP-NEXT: vpalignr {{.*#+}} ymm1 = ymm2[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],ymm2[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 3104(%rdi), %ymm6
; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
; AVX2-FCP-NEXT: vmovdqa 3008(%rdi), %xmm5
; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm5[0,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 3232(%rdi), %xmm0
; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa 3328(%rdi), %ymm3
; AVX2-FCP-NEXT: vpalignr $8, (%rsp), %ymm3, %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 3552(%rdi), %ymm2
; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
; AVX2-FCP-NEXT: vmovdqa 3456(%rdi), %xmm1
; AVX2-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm15 = mem[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0,1],mem[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm15 = mem[0,1],xmm15[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm14[1],ymm0[3],ymm14[3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],mem[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpunpckhqdq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = mem[0,1],xmm13[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm13[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm10[1],ymm0[3],ymm10[3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0,1],mem[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm10[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm12[1],ymm0[3],ymm12[3]
; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm10 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm10 = mem[0,1],xmm11[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm10[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm8[1],ymm0[3],ymm8[3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],mem[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm8[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm9[1],ymm0[3],ymm9[3]
; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm7 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm7 = mem[0,1],xmm7[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm4[1],ymm0[3],ymm4[3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],mem[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm6[1],ymm0[3],ymm6[3]
; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm4 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm4 = mem[0,1],xmm5[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm3[1],ymm0[3],ymm3[3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],mem[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vpbroadcastq {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
; AVX2-FCP-NEXT: vpblendd $3, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 480(%rax)
; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 448(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 416(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 384(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 352(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 320(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 288(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 256(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 224(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 192(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 160(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 64(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %ymm14, 480(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm0, 448(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm4, 416(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm8, 384(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm7, 352(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm10, 320(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm11, 288(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm13, 256(%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm15, 224(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 192(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 160(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 128(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 96(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 64(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
; AVX2-FCP-NEXT: addq $3928, %rsp # imm = 0xF58
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i64_stride7_vf64:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $7624, %rsp # imm = 0x1DC8
; AVX512-NEXT: vmovdqa64 3328(%rdi), %zmm16
; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 3264(%rdi), %zmm8
; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 3008(%rdi), %zmm19
; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 2944(%rdi), %zmm20
; AVX512-NEXT: vmovdqa64 2880(%rdi), %zmm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 2816(%rdi), %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 2752(%rdi), %zmm18
; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 2688(%rdi), %zmm7
; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 2432(%rdi), %zmm17
; AVX512-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 2368(%rdi), %zmm9
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm11
; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 1536(%rdi), %zmm12
; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm13
; AVX512-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm14
; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm10
; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm15
; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
; AVX512-NEXT: vmovdqa 2704(%rdi), %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm6
; AVX512-NEXT: vmovdqa 464(%rdi), %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1
; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1
; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm1
; AVX512-NEXT: vmovdqa 1360(%rdi), %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1
; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm1
; AVX512-NEXT: vmovdqa 912(%rdi), %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1
; AVX512-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
; AVX512-NEXT: vmovdqa 2256(%rdi), %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
; AVX512-NEXT: vmovdqa 1808(%rdi), %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermi2q %zmm16, %zmm8, %zmm0
; AVX512-NEXT: vmovdqa 3152(%rdi), %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
; AVX512-NEXT: vmovdqa 2816(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
; AVX512-NEXT: vpermt2q %zmm18, %zmm9, %zmm7
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm20, %zmm12
; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2
; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
; AVX512-NEXT: vmovdqa64 3072(%rdi), %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm2
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2
; AVX512-NEXT: vmovdqa 576(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm16
; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm16, %zmm9, %zmm3
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm31
; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm31, %zmm11, %zmm3
; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3
; AVX512-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm23
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512-NEXT: vpermt2q %zmm23, %zmm9, %zmm5
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm13
; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm6
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm24
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm6, %zmm4, %zmm5
; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5
; AVX512-NEXT: vmovdqa 1472(%rdi), %ymm3
; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm19
; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm7
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm6
; AVX512-NEXT: vpermt2q %zmm19, %zmm9, %zmm6
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7]
; AVX512-NEXT: vmovdqa64 1664(%rdi), %zmm17
; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm8
; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm8
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
; AVX512-NEXT: vmovdqa 1024(%rdi), %ymm3
; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7]
; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm30
; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm1
; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm30, %zmm9, %zmm8
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm29
; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm10
; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm29, %zmm11, %zmm10
; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm3, %zmm4, %zmm10
; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8
; AVX512-NEXT: vmovdqa 2368(%rdi), %ymm3
; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7]
; AVX512-NEXT: vmovdqa64 2304(%rdi), %zmm18
; AVX512-NEXT: vmovdqa64 2240(%rdi), %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm18, %zmm9, %zmm10
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
; AVX512-NEXT: vmovdqa64 2560(%rdi), %zmm20
; AVX512-NEXT: vmovdqa64 2496(%rdi), %zmm14
; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm20, %zmm11, %zmm14
; AVX512-NEXT: vmovdqa64 2624(%rdi), %zmm21
; AVX512-NEXT: vpermt2q %zmm21, %zmm4, %zmm14
; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10
; AVX512-NEXT: vmovdqa 1920(%rdi), %ymm3
; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm22
; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm5
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm6
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm22, %zmm9, %zmm15
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
; AVX512-NEXT: vmovdqa64 2112(%rdi), %zmm26
; AVX512-NEXT: vmovdqa64 2048(%rdi), %zmm15
; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm26, %zmm11, %zmm15
; AVX512-NEXT: vmovdqa64 2176(%rdi), %zmm28
; AVX512-NEXT: vpermt2q %zmm28, %zmm4, %zmm15
; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10
; AVX512-NEXT: vmovdqa 3264(%rdi), %ymm3
; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
; AVX512-NEXT: vmovdqa64 3200(%rdi), %zmm25
; AVX512-NEXT: vmovdqa64 3136(%rdi), %zmm14
; AVX512-NEXT: vpermi2q %zmm25, %zmm14, %zmm9
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512-NEXT: vpermt2q %zmm23, %zmm11, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm5
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm27
; AVX512-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm30, %zmm11, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm3
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm5
; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm5
; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm9
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm14, %zmm9
; AVX512-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 3456(%rdi), %zmm9
; AVX512-NEXT: vmovdqa64 3392(%rdi), %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermi2q %zmm9, %zmm0, %zmm11
; AVX512-NEXT: vmovdqa64 3520(%rdi), %zmm6
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm6, %zmm4, %zmm11
; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4
; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 2880(%rdi), %ymm4
; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
; AVX512-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12]
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm4
; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm4
; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0]
; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm16
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 640(%rdi), %ymm10
; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm6
; AVX512-NEXT: vpermt2q %zmm27, %zmm11, %zmm10
; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm31, %zmm10
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm16, %zmm0, %zmm10
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm23, %zmm11, %zmm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm13, %zmm2
; AVX512-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 1536(%rdi), %ymm1
; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm17, %zmm2
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 1088(%rdi), %ymm1
; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm30, %zmm11, %zmm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm29, %zmm2
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 2432(%rdi), %ymm1
; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm18, %zmm11, %zmm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm20, %zmm2
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; AVX512-NEXT: vpermt2q %zmm21, %zmm4, %zmm2
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 1984(%rdi), %ymm1
; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm12, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm22, %zmm11, %zmm2
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; AVX512-NEXT: vpermt2q %zmm28, %zmm4, %zmm2
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa 3328(%rdi), %ymm1
; AVX512-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX512-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermi2q %zmm25, %zmm14, %zmm11
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vpermt2q %zmm15, %zmm0, %zmm7
; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm27, %zmm2
; AVX512-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm23, %zmm2
; AVX512-NEXT: vpermt2q %zmm5, %zmm0, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm19, %zmm2
; AVX512-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm2
; AVX512-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm3, %zmm0, %zmm18
; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm12, %zmm0, %zmm22
; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm14, %zmm0, %zmm25
; AVX512-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm9, %zmm25
; AVX512-NEXT: vpermi2q %zmm6, %zmm9, %zmm0
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
; AVX512-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm2
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm12
; AVX512-NEXT: vpermt2q %zmm31, %zmm23, %zmm12
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
; AVX512-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-NEXT: vpermt2q %zmm31, %zmm24, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm16, %zmm0
; AVX512-NEXT: vpermt2q %zmm31, %zmm8, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
; AVX512-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm31, %zmm1, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0
; AVX512-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0
; AVX512-NEXT: vpermt2q %zmm13, %zmm24, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0
; AVX512-NEXT: vpermt2q %zmm13, %zmm8, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm0
; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm13, %zmm1, %zmm28
; AVX512-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0
; AVX512-NEXT: vpermt2q %zmm17, %zmm23, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm14
; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0
; AVX512-NEXT: vpermt2q %zmm17, %zmm24, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18
; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0
; AVX512-NEXT: vpermt2q %zmm17, %zmm8, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm22, %zmm0
; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm17, %zmm1, %zmm22
; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm9
; AVX512-NEXT: vpermt2q %zmm29, %zmm23, %zmm9
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0
; AVX512-NEXT: vpermt2q %zmm29, %zmm24, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0
; AVX512-NEXT: vpermt2q %zmm29, %zmm8, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm0
; AVX512-NEXT: vpermt2q %zmm29, %zmm16, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm29, %zmm1, %zmm30
; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7
; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm7
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3
; AVX512-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
; AVX512-NEXT: vmovdqa64 %zmm3, %zmm21
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3
; AVX512-NEXT: vpermt2q %zmm20, %zmm8, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm3
; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm3
; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm30
; AVX512-NEXT: vpermt2q %zmm26, %zmm23, %zmm30
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0
; AVX512-NEXT: vpermt2q %zmm26, %zmm24, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm5
; AVX512-NEXT: vpermt2q %zmm26, %zmm8, %zmm5
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm0
; AVX512-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
; AVX512-NEXT: vmovdqa64 %zmm0, %zmm29
; AVX512-NEXT: vpermt2q %zmm26, %zmm1, %zmm11
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm28
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm20, %zmm23, %zmm28
; AVX512-NEXT: vmovdqa64 %zmm6, %zmm31
; AVX512-NEXT: vmovdqa64 %zmm25, %zmm3
; AVX512-NEXT: vpermi2q %zmm25, %zmm6, %zmm23
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm27
; AVX512-NEXT: vpermt2q %zmm20, %zmm24, %zmm27
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm4, %zmm16, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermi2q %zmm25, %zmm31, %zmm24
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm25
; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2
; AVX512-NEXT: vpermt2q %zmm20, %zmm8, %zmm25
; AVX512-NEXT: vpermi2q %zmm3, %zmm31, %zmm2
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm0
; AVX512-NEXT: vpermt2q %zmm20, %zmm16, %zmm0
; AVX512-NEXT: vpermi2q %zmm3, %zmm31, %zmm16
; AVX512-NEXT: vpermt2q %zmm3, %zmm1, %zmm31
; AVX512-NEXT: vmovdqa64 %zmm31, %zmm3
; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm15
; AVX512-NEXT: vmovdqa64 %zmm15, %zmm31
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm10
; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm9
; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm14
; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm30
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm7
; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm28
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm23
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm27
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm18
; AVX512-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm19
; AVX512-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm21
; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm22
; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm24
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm25
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm5
; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm2
; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm0
17853 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17854 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17855 ; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
17856 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17857 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17858 ; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
17859 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17860 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17861 ; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
17862 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17863 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17864 ; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
17865 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17866 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17867 ; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
17868 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17869 ; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm29
17870 ; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17871 ; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm16
17872 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
17873 ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
17874 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
17875 ; AVX512-NEXT: vpermt2q %zmm6, %zmm1, %zmm18
17876 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
17877 ; AVX512-NEXT: vpermt2q %zmm8, %zmm1, %zmm19
17878 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
17879 ; AVX512-NEXT: vpermt2q %zmm9, %zmm1, %zmm21
17880 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
17881 ; AVX512-NEXT: vpermt2q %zmm10, %zmm1, %zmm22
17882 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17883 ; AVX512-NEXT: vpermt2q %zmm14, %zmm1, %zmm2
17884 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17885 ; AVX512-NEXT: vpermt2q %zmm15, %zmm1, %zmm11
17886 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17887 ; AVX512-NEXT: vpermt2q %zmm20, %zmm1, %zmm3
17888 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17889 ; AVX512-NEXT: vpermt2q %zmm7, %zmm1, %zmm31
17890 ; AVX512-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17891 ; AVX512-NEXT: movb $24, %al
17892 ; AVX512-NEXT: kmovw %eax, %k2
17893 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17894 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17895 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5]
17896 ; AVX512-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
17897 ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
17898 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm20
17899 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm20
17900 ; AVX512-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11]
17901 ; AVX512-NEXT: vpermt2q %zmm13, %zmm3, %zmm0
17902 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17903 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17904 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17905 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5]
17906 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm29
17907 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm29
17908 ; AVX512-NEXT: vpermt2q %zmm17, %zmm3, %zmm0
17909 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17910 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
17911 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17912 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5]
17913 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17
17914 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm17
17915 ; AVX512-NEXT: vpermt2q %zmm26, %zmm3, %zmm0
17916 ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
17917 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17918 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17919 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5]
17920 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0
17921 ; AVX512-NEXT: vpermt2q %zmm2, %zmm15, %zmm0
17922 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17923 ; AVX512-NEXT: vpermt2q %zmm4, %zmm3, %zmm2
17924 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17925 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17926 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17927 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17928 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
17929 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
17930 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
17931 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17932 ; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
17933 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17934 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17935 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17936 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17937 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
17938 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
17939 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
17940 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17941 ; AVX512-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
17942 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17943 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17944 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17945 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
17946 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5]
17947 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
17948 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm26
17949 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17950 ; AVX512-NEXT: vpermt2q %zmm1, %zmm15, %zmm26
17951 ; AVX512-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
17952 ; AVX512-NEXT: vpermt2q %zmm2, %zmm3, %zmm0
17953 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17954 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm0
17955 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
17956 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5]
17957 ; AVX512-NEXT: vpermt2q %zmm0, %zmm3, %zmm1
17958 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17959 ; AVX512-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
17960 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17961 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
17962 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17963 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
17964 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17965 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
17966 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17967 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
17968 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
17969 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
17970 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
17971 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
17972 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17973 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
17974 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17975 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17976 ; AVX512-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
17977 ; AVX512-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17978 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
17979 ; AVX512-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
17980 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
17981 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17982 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
17983 ; AVX512-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
17984 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
17985 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17986 ; AVX512-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload
17987 ; AVX512-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
17988 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
17989 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
17990 ; AVX512-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
17991 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
17992 ; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
17993 ; AVX512-NEXT: movb $-32, %al
17994 ; AVX512-NEXT: kmovw %eax, %k1
17995 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17996 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
17997 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17998 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
17999 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
18000 ; AVX512-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18001 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18002 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
18003 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18004 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18005 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
18006 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18007 ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1}
18008 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18009 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18010 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
18011 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18012 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1}
18013 ; AVX512-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18014 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
18015 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18016 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18017 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
18018 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1}
18019 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18020 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18021 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18022 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
18023 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18024 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
18025 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18026 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18027 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18028 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
18029 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18030 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
18031 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18032 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18033 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18034 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
18035 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18036 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
18037 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18038 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18039 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18040 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
18041 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18042 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
18043 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18044 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
18045 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18046 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
18047 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18048 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
18049 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18050 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18051 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
18052 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18053 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
18054 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18055 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18056 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
18057 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1}
18058 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm18
18059 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18060 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1}
18061 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
18062 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18063 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
18064 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
18065 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18066 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
18067 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
18068 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18069 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
18070 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
18071 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18072 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
18073 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
18074 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18075 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
18076 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
18077 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18078 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
18079 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
18080 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18081 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
18082 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
18083 ; AVX512-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7]
18084 ; AVX512-NEXT: vmovdqa 2752(%rdi), %ymm3
18085 ; AVX512-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
18086 ; AVX512-NEXT: vextracti128 $1, %ymm3, %xmm3
18087 ; AVX512-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3
18088 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18089 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
18090 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18091 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
18092 ; AVX512-NEXT: vmovdqa 512(%rdi), %ymm1
18093 ; AVX512-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
18094 ; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm1
18095 ; AVX512-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6
18096 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18097 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
18098 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
18099 ; AVX512-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7]
18100 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm4
18101 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
18102 ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4
18103 ; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14
18104 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
18105 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
18106 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
18107 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18108 ; AVX512-NEXT: vmovdqa 1408(%rdi), %ymm4
18109 ; AVX512-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
18110 ; AVX512-NEXT: vextracti128 $1, %ymm4, %xmm4
18111 ; AVX512-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5
18112 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
18113 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
18114 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
18115 ; AVX512-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
18116 ; AVX512-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
18117 ; AVX512-NEXT: vmovdqa 960(%rdi), %ymm8
18118 ; AVX512-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
18119 ; AVX512-NEXT: vextracti128 $1, %ymm8, %xmm8
18120 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18121 ; AVX512-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8
18122 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18123 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
18124 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18125 ; AVX512-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4
18126 ; AVX512-NEXT: vmovdqa 2304(%rdi), %ymm10
18127 ; AVX512-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
18128 ; AVX512-NEXT: vextracti128 $1, %ymm10, %xmm10
18129 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18130 ; AVX512-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10
18131 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18132 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
18133 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18134 ; AVX512-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
18135 ; AVX512-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7]
18136 ; AVX512-NEXT: vmovdqa 1856(%rdi), %ymm12
18137 ; AVX512-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
18138 ; AVX512-NEXT: vextracti128 $1, %ymm12, %xmm12
18139 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18140 ; AVX512-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12
18141 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18142 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1}
18143 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18144 ; AVX512-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11
18145 ; AVX512-NEXT: vmovdqa 3200(%rdi), %ymm13
18146 ; AVX512-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
18147 ; AVX512-NEXT: vextracti128 $1, %ymm13, %xmm13
18148 ; AVX512-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9
18149 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
18150 ; AVX512-NEXT: vmovdqa64 %zmm31, 448(%rsi)
18151 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18152 ; AVX512-NEXT: vmovaps %zmm13, 384(%rsi)
18153 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18154 ; AVX512-NEXT: vmovaps %zmm13, 320(%rsi)
18155 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18156 ; AVX512-NEXT: vmovaps %zmm13, 256(%rsi)
18157 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18158 ; AVX512-NEXT: vmovaps %zmm13, 192(%rsi)
18159 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18160 ; AVX512-NEXT: vmovaps %zmm13, 128(%rsi)
18161 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18162 ; AVX512-NEXT: vmovaps %zmm13, 64(%rsi)
18163 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18164 ; AVX512-NEXT: vmovaps %zmm13, (%rsi)
18165 ; AVX512-NEXT: vmovdqa64 %zmm18, 448(%rdx)
18166 ; AVX512-NEXT: vmovdqa64 %zmm2, 256(%rdx)
18167 ; AVX512-NEXT: vmovdqa64 %zmm23, 320(%rdx)
18168 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18169 ; AVX512-NEXT: vmovaps %zmm2, 128(%rdx)
18170 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18171 ; AVX512-NEXT: vmovaps %zmm2, 192(%rdx)
18172 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18173 ; AVX512-NEXT: vmovaps %zmm2, (%rdx)
18174 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18175 ; AVX512-NEXT: vmovaps %zmm2, 64(%rdx)
18176 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18177 ; AVX512-NEXT: vmovaps %zmm2, 384(%rdx)
18178 ; AVX512-NEXT: vmovdqa64 %zmm25, 448(%rcx)
18179 ; AVX512-NEXT: vmovdqa64 %zmm28, 256(%rcx)
18180 ; AVX512-NEXT: vmovdqa64 %zmm27, 320(%rcx)
18181 ; AVX512-NEXT: vmovdqa64 %zmm30, 128(%rcx)
18182 ; AVX512-NEXT: vmovdqa64 %zmm24, 192(%rcx)
18183 ; AVX512-NEXT: vmovdqa64 %zmm19, (%rcx)
18184 ; AVX512-NEXT: vmovdqa64 %zmm22, 64(%rcx)
18185 ; AVX512-NEXT: vmovdqa64 %zmm7, 384(%rcx)
18186 ; AVX512-NEXT: vmovdqa64 %zmm9, 448(%r8)
18187 ; AVX512-NEXT: vmovdqa64 %zmm12, 256(%r8)
18188 ; AVX512-NEXT: vmovdqa64 %zmm10, 320(%r8)
18189 ; AVX512-NEXT: vmovdqa64 %zmm8, 128(%r8)
18190 ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%r8)
18191 ; AVX512-NEXT: vmovdqa64 %zmm14, (%r8)
18192 ; AVX512-NEXT: vmovdqa64 %zmm6, 64(%r8)
18193 ; AVX512-NEXT: vmovdqa64 %zmm3, 384(%r8)
18194 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18195 ; AVX512-NEXT: vmovaps %zmm3, 448(%r9)
18196 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18197 ; AVX512-NEXT: vmovaps %zmm3, 256(%r9)
18198 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18199 ; AVX512-NEXT: vmovaps %zmm3, 320(%r9)
18200 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18201 ; AVX512-NEXT: vmovaps %zmm3, 128(%r9)
18202 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18203 ; AVX512-NEXT: vmovaps %zmm3, 192(%r9)
18204 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18205 ; AVX512-NEXT: vmovaps %zmm3, (%r9)
18206 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18207 ; AVX512-NEXT: vmovaps %zmm3, 64(%r9)
18208 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18209 ; AVX512-NEXT: vmovaps %zmm3, 384(%r9)
18210 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
18211 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18212 ; AVX512-NEXT: vmovaps %zmm2, 448(%rax)
18213 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18214 ; AVX512-NEXT: vmovaps %zmm2, 256(%rax)
18215 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18216 ; AVX512-NEXT: vmovaps %zmm2, 320(%rax)
18217 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18218 ; AVX512-NEXT: vmovaps %zmm2, 128(%rax)
18219 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18220 ; AVX512-NEXT: vmovaps %zmm2, 192(%rax)
18221 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18222 ; AVX512-NEXT: vmovaps %zmm2, (%rax)
18223 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18224 ; AVX512-NEXT: vmovaps %zmm2, 64(%rax)
18225 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18226 ; AVX512-NEXT: vmovaps %zmm3, 384(%rax)
18227 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
18228 ; AVX512-NEXT: vmovaps %zmm11, 384(%rax)
18229 ; AVX512-NEXT: vmovaps %zmm4, 448(%rax)
18230 ; AVX512-NEXT: vmovdqa64 %zmm1, 256(%rax)
18231 ; AVX512-NEXT: vmovdqa64 %zmm0, 320(%rax)
18232 ; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
18233 ; AVX512-NEXT: vmovaps %zmm0, 128(%rax)
18234 ; AVX512-NEXT: vmovdqa64 %zmm21, 192(%rax)
18235 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18236 ; AVX512-NEXT: vmovaps %zmm0, (%rax)
18237 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18238 ; AVX512-NEXT: vmovaps %zmm0, 64(%rax)
18239 ; AVX512-NEXT: addq $7624, %rsp # imm = 0x1DC8
18240 ; AVX512-NEXT: vzeroupper
18241 ; AVX512-NEXT: retq
18243 ; AVX512-FCP-LABEL: load_i64_stride7_vf64:
18244 ; AVX512-FCP: # %bb.0:
18245 ; AVX512-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8
18246 ; AVX512-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16
18247 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18248 ; AVX512-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm8
18249 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18250 ; AVX512-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19
18251 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18252 ; AVX512-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20
18253 ; AVX512-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
18254 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18255 ; AVX512-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
18256 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18257 ; AVX512-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18
18258 ; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18259 ; AVX512-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7
18260 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18261 ; AVX512-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17
18262 ; AVX512-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18263 ; AVX512-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9
18264 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18265 ; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11
18266 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18267 ; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
18268 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18269 ; AVX512-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
18270 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18271 ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13
18272 ; AVX512-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
18273 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14
18274 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18275 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4
18276 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18277 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm10
18278 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18279 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6
18280 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18281 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15
18282 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18283 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
18284 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18285 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
18286 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
18287 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
18288 ; AVX512-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
18289 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
18290 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
18291 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18292 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6
18293 ; AVX512-FCP-NEXT: vmovdqa 464(%rdi), %xmm2
18294 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
18295 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1
18296 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18297 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
18298 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
18299 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
18300 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
18301 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
18302 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18303 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
18304 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1
18305 ; AVX512-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
18306 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
18307 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
18308 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18309 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
18310 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1
18311 ; AVX512-FCP-NEXT: vmovdqa 912(%rdi), %xmm2
18312 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
18313 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
18314 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18315 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
18316 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
18317 ; AVX512-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2
18318 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
18319 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
18320 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18321 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
18322 ; AVX512-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
18323 ; AVX512-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2
18324 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
18325 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
18326 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18327 ; AVX512-FCP-NEXT: vpermi2q %zmm16, %zmm8, %zmm0
18328 ; AVX512-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1
18329 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
18330 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
18331 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18332 ; AVX512-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
18333 ; AVX512-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0
18334 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18335 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
18336 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
18337 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm7
18338 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
18339 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
18340 ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
18341 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm12
18342 ; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18343 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
18344 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
18345 ; AVX512-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3
18346 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18347 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
18348 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
18349 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2
18350 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
18351 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18352 ; AVX512-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2
18353 ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %ymm0
18354 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18355 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
18356 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16
18357 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
18358 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18359 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm3
18360 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
18361 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31
18362 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3
18363 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18364 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm3
18365 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5
18366 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18367 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
18368 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
18369 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18370 ; AVX512-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3
18371 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
18372 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18373 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
18374 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23
18375 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
18376 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
18377 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm5
18378 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
18379 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13
18380 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6
18381 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
18382 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm24
18383 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18384 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
18385 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
18386 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18387 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm5
18388 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
18389 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18390 ; AVX512-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5
18391 ; AVX512-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3
18392 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18393 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7]
18394 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19
18395 ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7
18396 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm6
18397 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm6
18398 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7]
18399 ; AVX512-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17
18400 ; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm8
18401 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18402 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
18403 ; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3
18404 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18405 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8
18406 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5
18407 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18408 ; AVX512-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
18409 ; AVX512-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3
18410 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18411 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7]
18412 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30
18413 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1
18414 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
18415 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18416 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm8
18417 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
18418 ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm29
18419 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm10
18420 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18421 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10
18422 ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3
18423 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18424 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10
18425 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5
18426 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18427 ; AVX512-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8
18428 ; AVX512-FCP-NEXT: vmovdqa 2368(%rdi), %ymm3
18429 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18430 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7]
18431 ; AVX512-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm18
18432 ; AVX512-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0
18433 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
18434 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18435 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm10
18436 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
18437 ; AVX512-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm20
18438 ; AVX512-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm14
18439 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18440 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm14
18441 ; AVX512-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm21
18442 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm14
18443 ; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18444 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5
18445 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18446 ; AVX512-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10
18447 ; AVX512-FCP-NEXT: vmovdqa 1920(%rdi), %ymm3
18448 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18449 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
18450 ; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm22
18451 ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm5
18452 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm15
18453 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm6
18454 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18455 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm15
18456 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
18457 ; AVX512-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm26
18458 ; AVX512-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm15
18459 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18460 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm15
18461 ; AVX512-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm28
18462 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm15
18463 ; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18464 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5
18465 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18466 ; AVX512-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10
18467 ; AVX512-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3
18468 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
18469 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
18470 ; AVX512-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm25
18471 ; AVX512-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm14
18472 ; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm9
18473 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7]
18474 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
18475 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm5
18476 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18477 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18478 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
18479 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm27
18480 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
18481 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18482 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm1
18483 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18484 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
18485 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm5
18486 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm5
18487 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18488 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
18489 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm5
18490 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18491 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm0
18492 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18493 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
18494 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
18495 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18496 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm9
18497 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18498 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm9
18499 ; AVX512-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
18500 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18501 ; AVX512-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm9
18502 ; AVX512-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm0
18503 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18504 ; AVX512-FCP-NEXT: vpermi2q %zmm9, %zmm0, %zmm11
18505 ; AVX512-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm6
18506 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18507 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm11
18508 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4
18509 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18510 ; AVX512-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4
18511 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
18512 ; AVX512-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
18513 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12]
18514 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm4
18515 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm4
18516 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
18517 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0]
18518 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
18519 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
18520 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm16
18521 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
18522 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
18523 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload
18524 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5
18525 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18526 ; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm10
18527 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
18528 ; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
18529 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10
18530 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm6
18531 ; AVX512-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm10
18532 ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18533 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
18534 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm10
18535 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
18536 ; AVX512-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm10
18537 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
18538 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
18539 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18540 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm1
18541 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
18542 ; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
18543 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
18544 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18545 ; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18546 ; AVX512-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm2
18547 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
18548 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm2
18549 ; AVX512-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
18550 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
18551 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
18552 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18553 ; AVX512-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1
18554 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
18555 ; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
18556 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8
18557 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18558 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
18559 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18560 ; AVX512-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
18561 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
18562 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
18563 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
18564 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
18565 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
18566 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18567 ; AVX512-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1
18568 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
18569 ; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
18570 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18571 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
18572 ; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18573 ; AVX512-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2
18574 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
18575 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm2
18576 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
18577 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
18578 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
18579 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18580 ; AVX512-FCP-NEXT: vmovdqa 2432(%rdi), %ymm1
18581 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
18582 ; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
18583 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18584 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
18585 ; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18586 ; AVX512-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm2
18587 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
18588 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
18589 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
18590 ; AVX512-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2
18591 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
18592 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18593 ; AVX512-FCP-NEXT: vmovdqa 1984(%rdi), %ymm1
18594 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
18595 ; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
18596 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18597 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
18598 ; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18599 ; AVX512-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm2
18600 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
18601 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
18602 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
18603 ; AVX512-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm2
18604 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
18605 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18606 ; AVX512-FCP-NEXT: vmovdqa 3328(%rdi), %ymm1
18607 ; AVX512-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
18608 ; AVX512-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
18609 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18610 ; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18611 ; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm11
18612 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
18613 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm7
18614 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18615 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm2
18616 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
18617 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18618 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm2
18619 ; AVX512-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2
18620 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18621 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
18622 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
18623 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18624 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm2
18625 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
18626 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18627 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm18
18628 ; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18629 ; AVX512-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm22
18630 ; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18631 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm25
18632 ; AVX512-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18633 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18634 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm25
18635 ; AVX512-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm0
18636 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
18637 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
18638 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18639 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
18640 ; AVX512-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18641 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm2
18642 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm12
18643 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm23, %zmm12
18644 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
18645 ; AVX512-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18646 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
18647 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm24, %zmm0
18648 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18649 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
18650 ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18651 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
18652 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0
18653 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18654 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
18655 ; AVX512-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18656 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
18657 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
18658 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18659 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
18660 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18661 ; AVX512-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm2
18662 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18663 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
18664 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
18665 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
18666 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
18667 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
18668 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm0
18669 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18670 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
18671 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm0
18672 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18673 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
18674 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
18675 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18676 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm28
18677 ; AVX512-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18678 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
18679 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
18680 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm0
18681 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm14
18682 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
18683 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm0
18684 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
18685 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
18686 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm0
18687 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18688 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
18689 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
18690 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18691 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm22
18692 ; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18693 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
18694 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm9
18695 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm9
18696 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
18697 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm0
18698 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19
18699 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
18700 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm0
18701 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18702 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
18703 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0
18704 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18705 ; AVX512-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm30
18706 ; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18707 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18708 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
18709 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm7
18710 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
18711 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
18712 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm21
18713 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
18714 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3
18715 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18716 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
18717 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm3
18718 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18719 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
18720 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18721 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
18722 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm30
18723 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm30
18724 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
18725 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm0
18726 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22
18727 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
18728 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm5
18729 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
18730 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
18731 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm29
18732 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11
18733 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
18734 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm28
18735 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
18736 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm28
18737 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm31
18738 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm3
18739 ; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm6, %zmm23
18740 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18741 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
18742 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18743 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm27
18744 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm27
18745 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18746 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18747 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
18748 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18749 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18750 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
18751 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
18752 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18753 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
18754 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
18755 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
18756 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18757 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18758 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
18759 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0
18760 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18761 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18762 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
18763 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18764 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18765 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
18766 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18767 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18768 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
18769 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18770 ; AVX512-FCP-NEXT: vpermi2q %zmm25, %zmm31, %zmm24
18771 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
18772 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm2
18773 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm25
18774 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm2
18775 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
18776 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0
18777 ; AVX512-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm16
18778 ; AVX512-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm31
18779 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3
18780 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm15
18781 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm31
18782 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
18783 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
18784 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18785 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm10
18786 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18787 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18788 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
18789 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18790 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18791 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm9
18792 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18793 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18794 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14
18795 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18796 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
18797 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm30
18798 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18799 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm7
18800 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18801 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18802 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm28
18803 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
18804 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm23
18805 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
18806 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
18807 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27
18808 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18809 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
18810 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18811 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18812 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
18813 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18814 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm18
18815 ; AVX512-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18816 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19
18817 ; AVX512-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18818 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm21
18819 ; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18820 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm22
18821 ; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18822 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm24
18823 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
18824 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
18825 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25
18826 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18827 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
18828 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18829 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18830 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
18831 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18832 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18833 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
18834 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18835 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18836 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
18837 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18838 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18839 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
18840 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18841 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm5
18842 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18843 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm2
18844 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18845 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
18846 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
18847 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0
18848 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18849 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18850 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
18851 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18852 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18853 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
18854 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18855 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18856 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
18857 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18858 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18859 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
18860 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18861 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18862 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
18863 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18864 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm29
18865 ; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18866 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm16
18867 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
18868 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
18869 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
18870 ; AVX512-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm18
18871 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
18872 ; AVX512-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm19
18873 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
18874 ; AVX512-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21
18875 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
18876 ; AVX512-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm22
18877 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18878 ; AVX512-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm2
18879 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18880 ; AVX512-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm11
18881 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18882 ; AVX512-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3
18883 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18884 ; AVX512-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31
18885 ; AVX512-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18886 ; AVX512-FCP-NEXT: movb $24, %al
18887 ; AVX512-FCP-NEXT: kmovw %eax, %k2
18888 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18889 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18890 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5]
18891 ; AVX512-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
18892 ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
18893 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm20
18894 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm20
18895 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11]
18896 ; AVX512-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0
18897 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18898 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18899 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18900 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5]
18901 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm29
18902 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29
18903 ; AVX512-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm0
18904 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18905 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
18906 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18907 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5]
18908 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
18909 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm17
18910 ; AVX512-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0
18911 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
18912 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18913 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18914 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5]
18915 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
18916 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm0
18917 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18918 ; AVX512-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm2
18919 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18920 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18921 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18922 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18923 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
18924 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
18925 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
18926 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18927 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
18928 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18929 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18930 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18931 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18932 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
18933 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
18934 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
18935 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18936 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
18937 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18938 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18939 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18940 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
18941 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5]
18942 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
18943 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm26
18944 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18945 ; AVX512-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm26
18946 ; AVX512-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
18947 ; AVX512-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0
18948 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18949 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
18950 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
18951 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5]
18952 ; AVX512-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1
18953 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18954 ; AVX512-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
18955 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18956 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
18957 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18958 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
18959 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18960 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
18961 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18962 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
18963 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
18964 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
18965 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18966 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
18967 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18968 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
18969 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18970 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18971 ; AVX512-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
18972 ; AVX512-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18973 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
18974 ; AVX512-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
18975 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
18976 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18977 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
18978 ; AVX512-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
18979 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
18980 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18981 ; AVX512-FCP-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload
18982 ; AVX512-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
18983 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
18984 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
18985 ; AVX512-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
18986 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
18987 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
18988 ; AVX512-FCP-NEXT: movb $-32, %al
18989 ; AVX512-FCP-NEXT: kmovw %eax, %k1
18990 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18991 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
18992 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18993 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18994 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
18995 ; AVX512-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18996 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
18997 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
18998 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18999 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19000 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
19001 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19002 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1}
19003 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19004 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19005 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
19006 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19007 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1}
19008 ; AVX512-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19009 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
19010 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19011 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19012 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
19013 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1}
19014 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19015 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19016 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19017 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
19018 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19019 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
19020 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19021 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19022 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19023 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
19024 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19025 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
19026 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19027 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19028 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19029 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
19030 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19031 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
19032 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19033 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19034 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19035 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
19036 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19037 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
19038 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19039 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
19040 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19041 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
19042 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19043 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
19044 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19045 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19046 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
19047 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19048 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
19049 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19050 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19051 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
19052 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1}
19053 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
19054 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19055 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1}
19056 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
19057 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19058 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
19059 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
19060 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19061 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
19062 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
19063 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19064 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
19065 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
19066 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19067 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
19068 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
19069 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19070 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
19071 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
19072 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19073 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
19074 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
19075 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19076 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
19077 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
19078 ; AVX512-FCP-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7]
19079 ; AVX512-FCP-NEXT: vmovdqa 2752(%rdi), %ymm3
19080 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
19081 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
19082 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3
19083 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19084 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
19085 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19086 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
19087 ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %ymm1
19088 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
19089 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
19090 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6
19091 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19092 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
19093 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
19094 ; AVX512-FCP-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7]
19095 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
19096 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
19097 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
19098 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14
19099 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19100 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
19101 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19102 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19103 ; AVX512-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4
19104 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
19105 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
19106 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5
19107 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19108 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
19109 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19110 ; AVX512-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
19111 ; AVX512-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
19112 ; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm8
19113 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
19114 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
19115 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19116 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8
19117 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19118 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
19119 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19120 ; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4
19121 ; AVX512-FCP-NEXT: vmovdqa 2304(%rdi), %ymm10
19122 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
19123 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
19124 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19125 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10
19126 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19127 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
19128 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19129 ; AVX512-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
19130 ; AVX512-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7]
19131 ; AVX512-FCP-NEXT: vmovdqa 1856(%rdi), %ymm12
19132 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
19133 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
19134 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19135 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12
19136 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19137 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1}
19138 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19139 ; AVX512-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11
19140 ; AVX512-FCP-NEXT: vmovdqa 3200(%rdi), %ymm13
19141 ; AVX512-FCP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
19142 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
19143 ; AVX512-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9
19144 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
19145 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 448(%rsi)
19146 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19147 ; AVX512-FCP-NEXT: vmovaps %zmm13, 384(%rsi)
19148 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19149 ; AVX512-FCP-NEXT: vmovaps %zmm13, 320(%rsi)
19150 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19151 ; AVX512-FCP-NEXT: vmovaps %zmm13, 256(%rsi)
19152 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19153 ; AVX512-FCP-NEXT: vmovaps %zmm13, 192(%rsi)
19154 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19155 ; AVX512-FCP-NEXT: vmovaps %zmm13, 128(%rsi)
19156 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19157 ; AVX512-FCP-NEXT: vmovaps %zmm13, 64(%rsi)
19158 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19159 ; AVX512-FCP-NEXT: vmovaps %zmm13, (%rsi)
19160 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 448(%rdx)
19161 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
19162 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx)
19163 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19164 ; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rdx)
19165 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19166 ; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rdx)
19167 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19168 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx)
19169 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19170 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
19171 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19172 ; AVX512-FCP-NEXT: vmovaps %zmm2, 384(%rdx)
19173 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 448(%rcx)
19174 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx)
19175 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, 320(%rcx)
19176 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 128(%rcx)
19177 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 192(%rcx)
19178 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
19179 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx)
19180 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, 384(%rcx)
19181 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 448(%r8)
19182 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%r8)
19183 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8)
19184 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 128(%r8)
19185 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8)
19186 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%r8)
19187 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
19188 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8)
19189 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19190 ; AVX512-FCP-NEXT: vmovaps %zmm3, 448(%r9)
19191 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19192 ; AVX512-FCP-NEXT: vmovaps %zmm3, 256(%r9)
19193 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19194 ; AVX512-FCP-NEXT: vmovaps %zmm3, 320(%r9)
19195 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19196 ; AVX512-FCP-NEXT: vmovaps %zmm3, 128(%r9)
19197 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19198 ; AVX512-FCP-NEXT: vmovaps %zmm3, 192(%r9)
19199 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19200 ; AVX512-FCP-NEXT: vmovaps %zmm3, (%r9)
19201 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19202 ; AVX512-FCP-NEXT: vmovaps %zmm3, 64(%r9)
19203 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19204 ; AVX512-FCP-NEXT: vmovaps %zmm3, 384(%r9)
19205 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
19206 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19207 ; AVX512-FCP-NEXT: vmovaps %zmm2, 448(%rax)
19208 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19209 ; AVX512-FCP-NEXT: vmovaps %zmm2, 256(%rax)
19210 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19211 ; AVX512-FCP-NEXT: vmovaps %zmm2, 320(%rax)
19212 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19213 ; AVX512-FCP-NEXT: vmovaps %zmm2, 128(%rax)
19214 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19215 ; AVX512-FCP-NEXT: vmovaps %zmm2, 192(%rax)
19216 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19217 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rax)
19218 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19219 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rax)
19220 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19221 ; AVX512-FCP-NEXT: vmovaps %zmm3, 384(%rax)
19222 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
19223 ; AVX512-FCP-NEXT: vmovaps %zmm11, 384(%rax)
19224 ; AVX512-FCP-NEXT: vmovaps %zmm4, 448(%rax)
19225 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax)
19226 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax)
19227 ; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
19228 ; AVX512-FCP-NEXT: vmovaps %zmm0, 128(%rax)
19229 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 192(%rax)
19230 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19231 ; AVX512-FCP-NEXT: vmovaps %zmm0, (%rax)
19232 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19233 ; AVX512-FCP-NEXT: vmovaps %zmm0, 64(%rax)
19234 ; AVX512-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8
19235 ; AVX512-FCP-NEXT: vzeroupper
19236 ; AVX512-FCP-NEXT: retq
19238 ; AVX512DQ-LABEL: load_i64_stride7_vf64:
19239 ; AVX512DQ: # %bb.0:
19240 ; AVX512DQ-NEXT: subq $7624, %rsp # imm = 0x1DC8
19241 ; AVX512DQ-NEXT: vmovdqa64 3328(%rdi), %zmm16
19242 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19243 ; AVX512DQ-NEXT: vmovdqa64 3264(%rdi), %zmm8
19244 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19245 ; AVX512DQ-NEXT: vmovdqa64 3008(%rdi), %zmm19
19246 ; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19247 ; AVX512DQ-NEXT: vmovdqa64 2944(%rdi), %zmm20
19248 ; AVX512DQ-NEXT: vmovdqa64 2880(%rdi), %zmm2
19249 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19250 ; AVX512DQ-NEXT: vmovdqa64 2816(%rdi), %zmm1
19251 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19252 ; AVX512DQ-NEXT: vmovdqa64 2752(%rdi), %zmm18
19253 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19254 ; AVX512DQ-NEXT: vmovdqa64 2688(%rdi), %zmm7
19255 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19256 ; AVX512DQ-NEXT: vmovdqa64 2432(%rdi), %zmm17
19257 ; AVX512DQ-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19258 ; AVX512DQ-NEXT: vmovdqa64 2368(%rdi), %zmm9
19259 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19260 ; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm11
19261 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19262 ; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm3
19263 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19264 ; AVX512DQ-NEXT: vmovdqa64 1536(%rdi), %zmm12
19265 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19266 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm13
19267 ; AVX512DQ-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
19268 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm14
19269 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19270 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm4
19271 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19272 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm10
19273 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19274 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm6
19275 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19276 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm15
19277 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19278 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm5
19279 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19280 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
19281 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
19282 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
19283 ; AVX512DQ-NEXT: vmovdqa 2704(%rdi), %xmm2
19284 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
19285 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
19286 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19287 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm6
19288 ; AVX512DQ-NEXT: vmovdqa 464(%rdi), %xmm2
19289 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
19290 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1
19291 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19292 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1
19293 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
19294 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm2
19295 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
19296 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
19297 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19298 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1
19299 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm1
19300 ; AVX512DQ-NEXT: vmovdqa 1360(%rdi), %xmm2
19301 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
19302 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
19303 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19304 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1
19305 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm1
19306 ; AVX512DQ-NEXT: vmovdqa 912(%rdi), %xmm2
19307 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
19308 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
19309 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19310 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1
19311 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
19312 ; AVX512DQ-NEXT: vmovdqa 2256(%rdi), %xmm2
19313 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
19314 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
19315 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19316 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1
19317 ; AVX512DQ-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
19318 ; AVX512DQ-NEXT: vmovdqa 1808(%rdi), %xmm2
19319 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
19320 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
19321 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19322 ; AVX512DQ-NEXT: vpermi2q %zmm16, %zmm8, %zmm0
19323 ; AVX512DQ-NEXT: vmovdqa 3152(%rdi), %xmm1
19324 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
19325 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
19326 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19327 ; AVX512DQ-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
19328 ; AVX512DQ-NEXT: vmovdqa 2816(%rdi), %ymm0
19329 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19330 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
19331 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
19332 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm9, %zmm7
19333 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
19334 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
19335 ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
19336 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm12
19337 ; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19338 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2
19339 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
19340 ; AVX512DQ-NEXT: vmovdqa64 3072(%rdi), %zmm3
19341 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19342 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
19343 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
19344 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm2
19345 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19346 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19347 ; AVX512DQ-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2
19348 ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %ymm0
19349 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19350 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
19351 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm16
19352 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm3
19353 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19354 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm9, %zmm3
19355 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
19356 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm31
19357 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm3
19358 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19359 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm11, %zmm3
19360 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm5
19361 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19362 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
19363 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
19364 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19365 ; AVX512DQ-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3
19366 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm2
19367 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19368 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
19369 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm23
19370 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm2
19371 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5
19372 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm9, %zmm5
19373 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
19374 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm13
19375 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm6
19376 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5
19377 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm24
19378 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19379 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
19380 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6
19381 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19382 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm4, %zmm5
19383 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
19384 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19385 ; AVX512DQ-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5
19386 ; AVX512DQ-NEXT: vmovdqa 1472(%rdi), %ymm3
19387 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19388 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7]
19389 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm19
19390 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm7
19391 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm6
19392 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm9, %zmm6
19393 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7]
19394 ; AVX512DQ-NEXT: vmovdqa64 1664(%rdi), %zmm17
19395 ; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm8
19396 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19397 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
19398 ; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm3
19399 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19400 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm8
19401 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5
19402 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19403 ; AVX512DQ-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
19404 ; AVX512DQ-NEXT: vmovdqa 1024(%rdi), %ymm3
19405 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19406 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7]
19407 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm30
19408 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm1
19409 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8
19410 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19411 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm9, %zmm8
19412 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
19413 ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm29
19414 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm10
19415 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19416 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm11, %zmm10
19417 ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm3
19418 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19419 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm4, %zmm10
19420 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5
19421 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19422 ; AVX512DQ-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8
19423 ; AVX512DQ-NEXT: vmovdqa 2368(%rdi), %ymm3
19424 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19425 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7]
19426 ; AVX512DQ-NEXT: vmovdqa64 2304(%rdi), %zmm18
19427 ; AVX512DQ-NEXT: vmovdqa64 2240(%rdi), %zmm0
19428 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10
19429 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19430 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm9, %zmm10
19431 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
19432 ; AVX512DQ-NEXT: vmovdqa64 2560(%rdi), %zmm20
19433 ; AVX512DQ-NEXT: vmovdqa64 2496(%rdi), %zmm14
19434 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19435 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm11, %zmm14
19436 ; AVX512DQ-NEXT: vmovdqa64 2624(%rdi), %zmm21
19437 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm4, %zmm14
19438 ; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19439 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5
19440 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19441 ; AVX512DQ-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10
19442 ; AVX512DQ-NEXT: vmovdqa 1920(%rdi), %ymm3
19443 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19444 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
19445 ; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm22
19446 ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm5
19447 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15
19448 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm6
19449 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19450 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm9, %zmm15
19451 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
19452 ; AVX512DQ-NEXT: vmovdqa64 2112(%rdi), %zmm26
19453 ; AVX512DQ-NEXT: vmovdqa64 2048(%rdi), %zmm15
19454 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19455 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm11, %zmm15
19456 ; AVX512DQ-NEXT: vmovdqa64 2176(%rdi), %zmm28
19457 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm4, %zmm15
19458 ; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19459 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5
19460 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19461 ; AVX512DQ-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10
19462 ; AVX512DQ-NEXT: vmovdqa 3264(%rdi), %ymm3
19463 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
19464 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
19465 ; AVX512DQ-NEXT: vmovdqa64 3200(%rdi), %zmm25
19466 ; AVX512DQ-NEXT: vmovdqa64 3136(%rdi), %zmm14
19467 ; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm14, %zmm9
19468 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7]
19469 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5
19470 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm11, %zmm5
19471 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19472 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19473 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm5
19474 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm27
19475 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
19476 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19477 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm11, %zmm1
19478 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19479 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm3
19480 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm5
19481 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm5
19482 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19483 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm5
19484 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm11, %zmm5
19485 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19486 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm11, %zmm0
19487 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19488 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
19489 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9
19490 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19491 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm9
19492 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19493 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm9
19494 ; AVX512DQ-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
19495 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19496 ; AVX512DQ-NEXT: vmovdqa64 3456(%rdi), %zmm9
19497 ; AVX512DQ-NEXT: vmovdqa64 3392(%rdi), %zmm0
19498 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19499 ; AVX512DQ-NEXT: vpermi2q %zmm9, %zmm0, %zmm11
19500 ; AVX512DQ-NEXT: vmovdqa64 3520(%rdi), %zmm6
19501 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19502 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm4, %zmm11
19503 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4
19504 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19505 ; AVX512DQ-NEXT: vmovdqa 2880(%rdi), %ymm4
19506 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
19507 ; AVX512DQ-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
19508 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12]
19509 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm4
19510 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm11, %zmm4
19511 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
19512 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0]
19513 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
19514 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
19515 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm16
19516 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
19517 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
19518 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload
19519 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5
19520 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19521 ; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm10
19522 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
19523 ; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
19524 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10
19525 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm6
19526 ; AVX512DQ-NEXT: vpermt2q %zmm27, %zmm11, %zmm10
19527 ; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19528 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
19529 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm10
19530 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
19531 ; AVX512DQ-NEXT: vpermt2q %zmm16, %zmm0, %zmm10
19532 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
19533 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
19534 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19535 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm1
19536 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
19537 ; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
19538 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5
19539 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19540 ; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19541 ; AVX512DQ-NEXT: vpermt2q %zmm23, %zmm11, %zmm2
19542 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19543 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm2
19544 ; AVX512DQ-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
19545 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
19546 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19547 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19548 ; AVX512DQ-NEXT: vmovdqa 1536(%rdi), %ymm1
19549 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
19550 ; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
19551 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8
19552 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19553 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2
19554 ; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19555 ; AVX512DQ-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
19556 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19557 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm2
19558 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
19559 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
19560 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19561 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19562 ; AVX512DQ-NEXT: vmovdqa 1088(%rdi), %ymm1
19563 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
19564 ; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
19565 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19566 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2
19567 ; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19568 ; AVX512DQ-NEXT: vpermt2q %zmm30, %zmm11, %zmm2
19569 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19570 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm2
19571 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
19572 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
19573 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19574 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19575 ; AVX512DQ-NEXT: vmovdqa 2432(%rdi), %ymm1
19576 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
19577 ; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
19578 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19579 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm2
19580 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19581 ; AVX512DQ-NEXT: vpermt2q %zmm18, %zmm11, %zmm2
19582 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19583 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm2
19584 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
19585 ; AVX512DQ-NEXT: vpermt2q %zmm21, %zmm4, %zmm2
19586 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19587 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19588 ; AVX512DQ-NEXT: vmovdqa 1984(%rdi), %ymm1
19589 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
19590 ; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
19591 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19592 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm2
19593 ; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19594 ; AVX512DQ-NEXT: vpermt2q %zmm22, %zmm11, %zmm2
19595 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
19596 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm2
19597 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
19598 ; AVX512DQ-NEXT: vpermt2q %zmm28, %zmm4, %zmm2
19599 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
19600 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19601 ; AVX512DQ-NEXT: vmovdqa 3328(%rdi), %ymm1
19602 ; AVX512DQ-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
19603 ; AVX512DQ-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
19604 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19605 ; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19606 ; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm14, %zmm11
19607 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
19608 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm0, %zmm7
19609 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19610 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm2
19611 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
19612 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19613 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm2
19614 ; AVX512DQ-NEXT: vpermt2q %zmm5, %zmm0, %zmm2
19615 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19616 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm2
19617 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
19618 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19619 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm2
19620 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
19621 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19622 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm0, %zmm18
19623 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19624 ; AVX512DQ-NEXT: vpermt2q %zmm12, %zmm0, %zmm22
19625 ; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19626 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm0, %zmm25
19627 ; AVX512DQ-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19628 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19629 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm25
19630 ; AVX512DQ-NEXT: vpermi2q %zmm6, %zmm9, %zmm0
19631 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
19632 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
19633 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19634 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
19635 ; AVX512DQ-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19636 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm2
19637 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm12
19638 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm23, %zmm12
19639 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
19640 ; AVX512DQ-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19641 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0
19642 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm24, %zmm0
19643 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19644 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
19645 ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19646 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm0
19647 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm8, %zmm0
19648 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19649 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
19650 ; AVX512DQ-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19651 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm0
19652 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
19653 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19654 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
19655 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19656 ; AVX512DQ-NEXT: vpermt2q %zmm31, %zmm1, %zmm2
19657 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19658 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
19659 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0
19660 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
19661 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10
19662 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0
19663 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm24, %zmm0
19664 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19665 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0
19666 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm8, %zmm0
19667 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19668 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm0
19669 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
19670 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19671 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm1, %zmm28
19672 ; AVX512DQ-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19673 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
19674 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0
19675 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm23, %zmm0
19676 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm14
19677 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0
19678 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm24, %zmm0
19679 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18
19680 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0
19681 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm8, %zmm0
19682 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19683 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm0
19684 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
19685 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19686 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm1, %zmm22
19687 ; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19688 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
19689 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm9
19690 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm23, %zmm9
19691 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0
19692 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm24, %zmm0
19693 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19
19694 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0
19695 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm8, %zmm0
19696 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19697 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm0
19698 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm16, %zmm0
19699 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19700 ; AVX512DQ-NEXT: vpermt2q %zmm29, %zmm1, %zmm30
19701 ; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19702 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19703 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7
19704 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm7
19705 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3
19706 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
19707 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm21
19708 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3
19709 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm8, %zmm3
19710 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19711 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm3
19712 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm3
19713 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19714 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
19715 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19716 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19717 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm30
19718 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm23, %zmm30
19719 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0
19720 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm24, %zmm0
19721 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22
19722 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm5
19723 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm8, %zmm5
19724 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm0
19725 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
19726 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm29
19727 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm1, %zmm11
19728 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
19729 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm28
19730 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
19731 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm23, %zmm28
19732 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm31
19733 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm3
19734 ; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm6, %zmm23
19735 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19736 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
19737 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19738 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm27
19739 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm24, %zmm27
19740 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19741 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19742 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
19743 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19744 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19745 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
19746 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
19747 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19748 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
19749 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
19750 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
19751 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19752 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19753 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19754 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm16, %zmm0
19755 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19756 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19757 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
19758 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19759 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19760 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
19761 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19762 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19763 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
19764 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19765 ; AVX512DQ-NEXT: vpermi2q %zmm25, %zmm31, %zmm24
19766 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm25
19767 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2
19768 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm8, %zmm25
19769 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm31, %zmm2
19770 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm0
19771 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm16, %zmm0
19772 ; AVX512DQ-NEXT: vpermi2q %zmm3, %zmm31, %zmm16
19773 ; AVX512DQ-NEXT: vpermt2q %zmm3, %zmm1, %zmm31
19774 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm3
19775 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm15
19776 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm31
19777 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
19778 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
19779 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19780 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm10
19781 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19782 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19783 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
19784 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19785 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19786 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm9
19787 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19788 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19789 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm14
19790 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19791 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
19792 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm30
19793 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19794 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm7
19795 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19796 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19797 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm28
19798 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
19799 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm23
19800 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
19801 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
19802 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm27
19803 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19804 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
19805 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19806 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19807 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
19808 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19809 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm18
19810 ; AVX512DQ-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19811 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm19
19812 ; AVX512DQ-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19813 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm21
19814 ; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19815 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm22
19816 ; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19817 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm24
19818 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
19819 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
19820 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm25
19821 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19822 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
19823 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19824 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19825 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
19826 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19827 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19828 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
19829 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19830 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19831 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
19832 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19833 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19834 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
19835 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19836 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm5
19837 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19838 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm2
19839 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19840 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
19841 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
19842 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm0
19843 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19844 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19845 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
19846 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19847 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19848 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
19849 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19850 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19851 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
19852 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19853 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19854 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
19855 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19856 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19857 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
19858 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19859 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm29
19860 ; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19861 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm16
19862 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
19863 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
19864 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
19865 ; AVX512DQ-NEXT: vpermt2q %zmm6, %zmm1, %zmm18
19866 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
19867 ; AVX512DQ-NEXT: vpermt2q %zmm8, %zmm1, %zmm19
19868 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
19869 ; AVX512DQ-NEXT: vpermt2q %zmm9, %zmm1, %zmm21
19870 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
19871 ; AVX512DQ-NEXT: vpermt2q %zmm10, %zmm1, %zmm22
19872 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19873 ; AVX512DQ-NEXT: vpermt2q %zmm14, %zmm1, %zmm2
19874 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19875 ; AVX512DQ-NEXT: vpermt2q %zmm15, %zmm1, %zmm11
19876 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19877 ; AVX512DQ-NEXT: vpermt2q %zmm20, %zmm1, %zmm3
19878 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19879 ; AVX512DQ-NEXT: vpermt2q %zmm7, %zmm1, %zmm31
19880 ; AVX512DQ-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19881 ; AVX512DQ-NEXT: movb $24, %al
19882 ; AVX512DQ-NEXT: kmovw %eax, %k2
19883 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19884 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19885 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5]
19886 ; AVX512DQ-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
19887 ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
19888 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm20
19889 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm20
19890 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11]
19891 ; AVX512DQ-NEXT: vpermt2q %zmm13, %zmm3, %zmm0
19892 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19893 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19894 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19895 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5]
19896 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm29
19897 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm29
19898 ; AVX512DQ-NEXT: vpermt2q %zmm17, %zmm3, %zmm0
19899 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19900 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
19901 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19902 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5]
19903 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17
19904 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm17
19905 ; AVX512DQ-NEXT: vpermt2q %zmm26, %zmm3, %zmm0
19906 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
19907 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19908 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19909 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5]
19910 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0
19911 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm15, %zmm0
19912 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19913 ; AVX512DQ-NEXT: vpermt2q %zmm4, %zmm3, %zmm2
19914 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19915 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19916 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19917 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19918 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
19919 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
19920 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
19921 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19922 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
19923 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19924 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19925 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19926 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19927 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
19928 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2
19929 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
19930 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19931 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
19932 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19933 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19934 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19935 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
19936 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5]
19937 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19938 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm26
19939 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19940 ; AVX512DQ-NEXT: vpermt2q %zmm1, %zmm15, %zmm26
19941 ; AVX512DQ-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
19942 ; AVX512DQ-NEXT: vpermt2q %zmm2, %zmm3, %zmm0
19943 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19944 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm0
19945 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19946 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5]
19947 ; AVX512DQ-NEXT: vpermt2q %zmm0, %zmm3, %zmm1
19948 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19949 ; AVX512DQ-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
19950 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19951 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
19952 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19953 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
19954 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19955 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
19956 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19957 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
19958 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19959 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
19960 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19961 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
19962 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19963 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
19964 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19965 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19966 ; AVX512DQ-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
19967 ; AVX512DQ-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19968 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
19969 ; AVX512DQ-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
19970 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
19971 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19972 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
19973 ; AVX512DQ-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
19974 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
19975 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19976 ; AVX512DQ-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload
19977 ; AVX512DQ-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
19978 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
19979 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
19980 ; AVX512DQ-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
19981 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
19982 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
19983 ; AVX512DQ-NEXT: movb $-32, %al
19984 ; AVX512DQ-NEXT: kmovw %eax, %k1
19985 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19986 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
19987 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19988 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19989 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
19990 ; AVX512DQ-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19991 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19992 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
19993 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19994 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
19995 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
19996 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19997 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1}
19998 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19999 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20000 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
20001 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20002 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1}
20003 ; AVX512DQ-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20004 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
20005 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20006 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20007 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
20008 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1}
20009 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20010 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20011 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20012 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
20013 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20014 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
20015 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20016 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20017 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20018 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
20019 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20020 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
20021 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20022 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20023 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20024 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
20025 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20026 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
20027 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20028 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20029 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20030 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
20031 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20032 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
20033 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20034 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
20035 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20036 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
20037 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20038 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
20039 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20040 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20041 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
20042 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20043 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
20044 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20045 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20046 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
20047 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1}
20048 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm18
20049 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20050 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1}
20051 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
20052 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20053 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
20054 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
20055 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20056 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
20057 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
20058 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20059 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
20060 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
20061 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20062 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
20063 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
20064 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20065 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
20066 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
20067 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20068 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
20069 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
20070 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20071 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
20072 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
20073 ; AVX512DQ-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7]
20074 ; AVX512DQ-NEXT: vmovdqa 2752(%rdi), %ymm3
20075 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
20076 ; AVX512DQ-NEXT: vextracti128 $1, %ymm3, %xmm3
20077 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3
20078 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20079 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
20080 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20081 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
20082 ; AVX512DQ-NEXT: vmovdqa 512(%rdi), %ymm1
20083 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
20084 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm1
20085 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6
20086 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20087 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
20088 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
20089 ; AVX512DQ-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7]
20090 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm4
20091 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
20092 ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4
20093 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14
20094 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20095 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
20096 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20097 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20098 ; AVX512DQ-NEXT: vmovdqa 1408(%rdi), %ymm4
20099 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
20100 ; AVX512DQ-NEXT: vextracti128 $1, %ymm4, %xmm4
20101 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5
20102 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20103 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
20104 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20105 ; AVX512DQ-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
20106 ; AVX512DQ-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
20107 ; AVX512DQ-NEXT: vmovdqa 960(%rdi), %ymm8
20108 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
20109 ; AVX512DQ-NEXT: vextracti128 $1, %ymm8, %xmm8
20110 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20111 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8
20112 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20113 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
20114 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20115 ; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4
20116 ; AVX512DQ-NEXT: vmovdqa 2304(%rdi), %ymm10
20117 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
20118 ; AVX512DQ-NEXT: vextracti128 $1, %ymm10, %xmm10
20119 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20120 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10
20121 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20122 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
20123 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20124 ; AVX512DQ-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
20125 ; AVX512DQ-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7]
20126 ; AVX512DQ-NEXT: vmovdqa 1856(%rdi), %ymm12
20127 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
20128 ; AVX512DQ-NEXT: vextracti128 $1, %ymm12, %xmm12
20129 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20130 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12
20131 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20132 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1}
20133 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20134 ; AVX512DQ-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11
20135 ; AVX512DQ-NEXT: vmovdqa 3200(%rdi), %ymm13
20136 ; AVX512DQ-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
20137 ; AVX512DQ-NEXT: vextracti128 $1, %ymm13, %xmm13
20138 ; AVX512DQ-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9
20139 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
20140 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, 448(%rsi)
20141 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20142 ; AVX512DQ-NEXT: vmovaps %zmm13, 384(%rsi)
20143 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20144 ; AVX512DQ-NEXT: vmovaps %zmm13, 320(%rsi)
20145 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20146 ; AVX512DQ-NEXT: vmovaps %zmm13, 256(%rsi)
20147 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20148 ; AVX512DQ-NEXT: vmovaps %zmm13, 192(%rsi)
20149 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20150 ; AVX512DQ-NEXT: vmovaps %zmm13, 128(%rsi)
20151 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20152 ; AVX512DQ-NEXT: vmovaps %zmm13, 64(%rsi)
20153 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20154 ; AVX512DQ-NEXT: vmovaps %zmm13, (%rsi)
20155 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 448(%rdx)
20156 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 256(%rdx)
20157 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, 320(%rdx)
20158 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20159 ; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rdx)
20160 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20161 ; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rdx)
20162 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20163 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx)
20164 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20165 ; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx)
20166 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20167 ; AVX512DQ-NEXT: vmovaps %zmm2, 384(%rdx)
20168 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, 448(%rcx)
20169 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 256(%rcx)
20170 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, 320(%rcx)
20171 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, 128(%rcx)
20172 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 192(%rcx)
20173 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rcx)
20174 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%rcx)
20175 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rcx)
20176 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 448(%r8)
20177 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, 256(%r8)
20178 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, 320(%r8)
20179 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 128(%r8)
20180 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%r8)
20181 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%r8)
20182 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 64(%r8)
20183 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 384(%r8)
20184 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20185 ; AVX512DQ-NEXT: vmovaps %zmm3, 448(%r9)
20186 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20187 ; AVX512DQ-NEXT: vmovaps %zmm3, 256(%r9)
20188 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20189 ; AVX512DQ-NEXT: vmovaps %zmm3, 320(%r9)
20190 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20191 ; AVX512DQ-NEXT: vmovaps %zmm3, 128(%r9)
20192 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20193 ; AVX512DQ-NEXT: vmovaps %zmm3, 192(%r9)
20194 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20195 ; AVX512DQ-NEXT: vmovaps %zmm3, (%r9)
20196 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20197 ; AVX512DQ-NEXT: vmovaps %zmm3, 64(%r9)
20198 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20199 ; AVX512DQ-NEXT: vmovaps %zmm3, 384(%r9)
20200 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
20201 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20202 ; AVX512DQ-NEXT: vmovaps %zmm2, 448(%rax)
20203 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20204 ; AVX512DQ-NEXT: vmovaps %zmm2, 256(%rax)
20205 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20206 ; AVX512DQ-NEXT: vmovaps %zmm2, 320(%rax)
20207 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20208 ; AVX512DQ-NEXT: vmovaps %zmm2, 128(%rax)
20209 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20210 ; AVX512DQ-NEXT: vmovaps %zmm2, 192(%rax)
20211 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20212 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rax)
20213 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20214 ; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rax)
20215 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20216 ; AVX512DQ-NEXT: vmovaps %zmm3, 384(%rax)
20217 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
20218 ; AVX512DQ-NEXT: vmovaps %zmm11, 384(%rax)
20219 ; AVX512DQ-NEXT: vmovaps %zmm4, 448(%rax)
20220 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 256(%rax)
20221 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 320(%rax)
20222 ; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
20223 ; AVX512DQ-NEXT: vmovaps %zmm0, 128(%rax)
20224 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, 192(%rax)
20225 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20226 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax)
20227 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20228 ; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax)
20229 ; AVX512DQ-NEXT: addq $7624, %rsp # imm = 0x1DC8
20230 ; AVX512DQ-NEXT: vzeroupper
20231 ; AVX512DQ-NEXT: retq
20233 ; AVX512DQ-FCP-LABEL: load_i64_stride7_vf64:
20234 ; AVX512DQ-FCP: # %bb.0:
20235 ; AVX512DQ-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8
20236 ; AVX512DQ-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm16
20237 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20238 ; AVX512DQ-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm8
20239 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20240 ; AVX512DQ-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm19
20241 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20242 ; AVX512DQ-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm20
20243 ; AVX512DQ-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
20244 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20245 ; AVX512DQ-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
20246 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20247 ; AVX512DQ-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm18
20248 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20249 ; AVX512DQ-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm7
20250 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20251 ; AVX512DQ-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm17
20252 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20253 ; AVX512DQ-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm9
20254 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20255 ; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm11
20256 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20257 ; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
20258 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20259 ; AVX512DQ-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm12
20260 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20261 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm13
20262 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm13, (%rsp) # 64-byte Spill
20263 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14
20264 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20265 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm4
20266 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20267 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm10
20268 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20269 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm6
20270 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20271 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm15
20272 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20273 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm5
20274 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20275 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
20276 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
20277 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
20278 ; AVX512DQ-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
20279 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
20280 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
20281 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20282 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm6
20283 ; AVX512DQ-FCP-NEXT: vmovdqa 464(%rdi), %xmm2
20284 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
20285 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm6, %zmm1
20286 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20287 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
20288 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm1
20289 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
20290 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
20291 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
20292 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20293 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
20294 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm1
20295 ; AVX512DQ-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
20296 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
20297 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
20298 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20299 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
20300 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm1
20301 ; AVX512DQ-FCP-NEXT: vmovdqa 912(%rdi), %xmm2
20302 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
20303 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
20304 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20305 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
20306 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
20307 ; AVX512DQ-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2
20308 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
20309 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
20310 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20311 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
20312 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
20313 ; AVX512DQ-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2
20314 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
20315 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
20316 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20317 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm16, %zmm8, %zmm0
20318 ; AVX512DQ-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1
20319 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
20320 ; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
20321 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20322 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm1
20323 ; AVX512DQ-FCP-NEXT: vmovdqa 2816(%rdi), %ymm0
20324 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20325 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
20326 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
20327 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm7
20328 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7]
20329 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,7,14,0,0,7,14,0]
20330 ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3]
20331 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm12
20332 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20333 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
20334 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
20335 ; AVX512DQ-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm3
20336 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20337 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
20338 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
20339 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm2
20340 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20341 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20342 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm2
20343 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %ymm0
20344 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20345 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm2[6,7]
20346 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm16
20347 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm3
20348 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20349 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm9, %zmm3
20350 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
20351 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm31
20352 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm3
20353 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20354 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm11, %zmm3
20355 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm5
20356 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20357 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm3
20358 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
20359 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20360 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm3
20361 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
20362 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20363 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5],ymm3[6,7]
20364 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm23
20365 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
20366 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
20367 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm9, %zmm5
20368 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3],ymm3[4,5,6,7]
20369 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm13
20370 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm6
20371 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
20372 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm24
20373 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20374 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm11, %zmm5
20375 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
20376 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20377 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm5
20378 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm5, %zmm3
20379 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20380 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm5
20381 ; AVX512DQ-FCP-NEXT: vmovdqa 1472(%rdi), %ymm3
20382 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20383 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0,1,2,3,4,5],ymm5[6,7]
20384 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm19
20385 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7
20386 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm6
20387 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm6
20388 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm5[4,5,6,7]
20389 ; AVX512DQ-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm17
20390 ; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm8
20391 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20392 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
20393 ; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm3
20394 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20395 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm8
20396 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm6, %zmm8, %zmm5
20397 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20398 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm6
20399 ; AVX512DQ-FCP-NEXT: vmovdqa 1024(%rdi), %ymm3
20400 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20401 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm3[0,1,2,3,4,5],ymm6[6,7]
20402 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm30
20403 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm1
20404 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
20405 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20406 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm8
20407 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm6[4,5,6,7]
20408 ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm29
20409 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm10
20410 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20411 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm11, %zmm10
20412 ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm3
20413 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20414 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm10
20415 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm5
20416 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20417 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm8
20418 ; AVX512DQ-FCP-NEXT: vmovdqa 2368(%rdi), %ymm3
20419 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20420 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5],ymm8[6,7]
20421 ; AVX512DQ-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm18
20422 ; AVX512DQ-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm0
20423 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
20424 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20425 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm9, %zmm10
20426 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm8[4,5,6,7]
20427 ; AVX512DQ-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm20
20428 ; AVX512DQ-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm14
20429 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20430 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm11, %zmm14
20431 ; AVX512DQ-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm21
20432 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm14
20433 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20434 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm14, %zmm5
20435 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20436 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm10
20437 ; AVX512DQ-FCP-NEXT: vmovdqa 1920(%rdi), %ymm3
20438 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20439 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
20440 ; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm22
20441 ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm5
20442 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm15
20443 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm6
20444 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20445 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm9, %zmm15
20446 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1,2,3],ymm10[4,5,6,7]
20447 ; AVX512DQ-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm26
20448 ; AVX512DQ-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm15
20449 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20450 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm11, %zmm15
20451 ; AVX512DQ-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm28
20452 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm15
20453 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20454 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm15, %zmm5
20455 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20456 ; AVX512DQ-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm10
20457 ; AVX512DQ-FCP-NEXT: vmovdqa 3264(%rdi), %ymm3
20458 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
20459 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0,1,2,3,4,5],ymm10[6,7]
20460 ; AVX512DQ-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm25
20461 ; AVX512DQ-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm14
20462 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm9
20463 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm9[0,1,2,3],ymm10[4,5,6,7]
20464 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
20465 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm5
20466 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20467 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20468 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm5
20469 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm27
20470 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm11, %zmm5
20471 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20472 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm1
20473 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20474 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm3
20475 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm5
20476 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm5
20477 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20478 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm5
20479 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm5
20480 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20481 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm0
20482 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20483 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
20484 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
20485 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20486 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm9
20487 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20488 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm9
20489 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm25, %zmm11, %zmm9
20490 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20491 ; AVX512DQ-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm9
20492 ; AVX512DQ-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm0
20493 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20494 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm9, %zmm0, %zmm11
20495 ; AVX512DQ-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm6
20496 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20497 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm4, %zmm11
20498 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm11, %zmm4
20499 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20500 ; AVX512DQ-FCP-NEXT: vmovdqa 2880(%rdi), %ymm4
20501 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
20502 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
20503 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm11 = [5,12]
20504 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm4
20505 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm11, %zmm4
20506 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm4[0,1,2,3],ymm0[4,5,6,7]
20507 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [9,0,7,0,9,0,7,0]
20508 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
20509 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
20510 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm16
20511 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,14,4,5,6,14]
20512 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
20513 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm16 # 64-byte Folded Reload
20514 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm16, %zmm5
20515 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20516 ; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm10
20517 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm1 # 32-byte Folded Reload
20518 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
20519 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10
20520 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm6
20521 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm27, %zmm11, %zmm10
20522 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20523 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm1[4,5,6,7]
20524 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm10
20525 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
20526 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm10
20527 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm10 # 64-byte Folded Reload
20528 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm10, %zmm1
20529 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20530 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm1
20531 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
20532 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
20533 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
20534 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20535 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20536 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm23, %zmm11, %zmm2
20537 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20538 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm2
20539 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm24, %zmm0, %zmm2
20540 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
20541 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20542 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20543 ; AVX512DQ-FCP-NEXT: vmovdqa 1536(%rdi), %ymm1
20544 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
20545 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
20546 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8
20547 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20548 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
20549 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20550 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm19, %zmm11, %zmm2
20551 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20552 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm2
20553 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
20554 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
20555 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20556 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20557 ; AVX512DQ-FCP-NEXT: vmovdqa 1088(%rdi), %ymm1
20558 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
20559 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
20560 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20561 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
20562 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20563 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm30, %zmm11, %zmm2
20564 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20565 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm2
20566 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
20567 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 # 64-byte Folded Reload
20568 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20569 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20570 ; AVX512DQ-FCP-NEXT: vmovdqa 2432(%rdi), %ymm1
20571 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
20572 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
20573 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20574 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm2
20575 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20576 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm18, %zmm11, %zmm2
20577 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20578 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm2
20579 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
20580 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm21, %zmm4, %zmm2
20581 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20582 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20583 ; AVX512DQ-FCP-NEXT: vmovdqa 1984(%rdi), %ymm1
20584 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
20585 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
20586 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20587 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm2
20588 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20589 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm22, %zmm11, %zmm2
20590 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
20591 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
20592 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
20593 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm28, %zmm4, %zmm2
20594 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm2, %zmm1
20595 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20596 ; AVX512DQ-FCP-NEXT: vmovdqa 3328(%rdi), %ymm1
20597 ; AVX512DQ-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
20598 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
20599 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20600 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20601 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm14, %zmm11
20602 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3],ymm1[4,5,6,7]
20603 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm0, %zmm7
20604 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20605 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm2
20606 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm2
20607 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20608 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm2
20609 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm5, %zmm0, %zmm2
20610 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20611 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm2
20612 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm0, %zmm2
20613 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20614 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm2
20615 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm2
20616 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20617 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm0, %zmm18
20618 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20619 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm12, %zmm0, %zmm22
20620 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20621 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm0, %zmm25
20622 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20623 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20624 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm25
20625 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm6, %zmm9, %zmm0
20626 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm0 # 64-byte Folded Reload
20627 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
20628 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20629 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
20630 ; AVX512DQ-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20631 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm2
20632 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm12
20633 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm23, %zmm12
20634 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [11,4,11,4,11,4,11,4]
20635 ; AVX512DQ-FCP-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20636 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
20637 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm24, %zmm0
20638 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20639 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [12,5,12,5,12,5,12,5]
20640 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20641 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm0
20642 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm8, %zmm0
20643 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20644 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm16 = [13,6,13,6,13,6,13,6]
20645 ; AVX512DQ-FCP-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20646 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm0
20647 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm16, %zmm0
20648 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20649 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
20650 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20651 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm31, %zmm1, %zmm2
20652 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20653 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
20654 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
20655 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
20656 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10
20657 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
20658 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm24, %zmm0
20659 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20660 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
20661 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm8, %zmm0
20662 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20663 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
20664 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
20665 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20666 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm28
20667 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20668 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
20669 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
20670 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm23, %zmm0
20671 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm14
20672 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
20673 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm24, %zmm0
20674 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
20675 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
20676 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm8, %zmm0
20677 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20678 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm0
20679 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
20680 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20681 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm1, %zmm22
20682 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20683 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
20684 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm9
20685 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm23, %zmm9
20686 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
20687 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm24, %zmm0
20688 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19
20689 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
20690 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm8, %zmm0
20691 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20692 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm0
20693 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm16, %zmm0
20694 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20695 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm29, %zmm1, %zmm30
20696 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20697 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20698 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7
20699 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm7
20700 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
20701 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm3
20702 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm21
20703 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
20704 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm3
20705 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20706 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm3
20707 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm3
20708 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20709 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm0
20710 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20711 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
20712 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm30
20713 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm23, %zmm30
20714 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
20715 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm24, %zmm0
20716 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22
20717 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm5
20718 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm8, %zmm5
20719 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm0
20720 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
20721 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm29
20722 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm1, %zmm11
20723 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
20724 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm28
20725 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
20726 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm23, %zmm28
20727 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm31
20728 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm3
20729 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm6, %zmm23
20730 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20731 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
20732 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20733 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm27
20734 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm24, %zmm27
20735 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20736 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20737 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm16, %zmm0
20738 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20739 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20740 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
20741 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm16, %zmm0
20742 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20743 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
20744 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
20745 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm16, %zmm0
20746 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20747 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20748 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20749 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm16, %zmm0
20750 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20751 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20752 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
20753 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20754 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20755 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm6 # 64-byte Folded Reload
20756 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20757 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20758 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm16, %zmm0 # 64-byte Folded Reload
20759 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20760 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm25, %zmm31, %zmm24
20761 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm25
20762 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm2
20763 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm8, %zmm25
20764 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm2
20765 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
20766 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm16, %zmm0
20767 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm3, %zmm31, %zmm16
20768 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm3, %zmm1, %zmm31
20769 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3
20770 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm15
20771 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm31
20772 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
20773 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
20774 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20775 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm10
20776 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20777 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20778 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
20779 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20780 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20781 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm9
20782 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20783 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20784 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm14
20785 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20786 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
20787 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm30
20788 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20789 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm7
20790 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20791 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20792 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm28
20793 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
20794 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm23
20795 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
20796 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
20797 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm27
20798 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20799 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
20800 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20801 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20802 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
20803 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20804 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm18
20805 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20806 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm19
20807 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20808 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm21
20809 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20810 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm22
20811 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20812 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm24
20813 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
20814 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
20815 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm25
20816 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20817 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
20818 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20819 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20820 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
20821 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20822 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20823 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
20824 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20825 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20826 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
20827 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20828 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20829 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
20830 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20831 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm5
20832 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20833 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm2
20834 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20835 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
20836 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
20837 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm0
20838 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20839 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20840 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm12
20841 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20842 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20843 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm12
20844 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20845 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20846 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm12
20847 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20848 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20849 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm12
20850 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20851 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20852 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm12
20853 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20854 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm29
20855 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20856 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm16
20857 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
20858 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
20859 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
20860 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm6, %zmm1, %zmm18
20861 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
20862 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm19
20863 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
20864 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm9, %zmm1, %zmm21
20865 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
20866 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm10, %zmm1, %zmm22
20867 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20868 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm2
20869 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20870 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm11
20871 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20872 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm20, %zmm1, %zmm3
20873 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20874 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm7, %zmm1, %zmm31
20875 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20876 ; AVX512DQ-FCP-NEXT: movb $24, %al
20877 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k2
20878 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20879 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20880 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 {%k2} = zmm0[4,5,4,5],zmm13[4,5,4,5]
20881 ; AVX512DQ-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [7,0,9,0,7,0,9,0]
20882 ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
20883 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm20
20884 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm20
20885 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} ymm3 = [0,0,4,11]
20886 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm13, %zmm3, %zmm0
20887 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20888 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20889 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20890 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm0[4,5,4,5],zmm17[4,5,4,5]
20891 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm29
20892 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm29
20893 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm17, %zmm3, %zmm0
20894 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20895 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload
20896 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20897 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k2} = zmm0[4,5,4,5],zmm26[4,5,4,5]
20898 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
20899 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm17
20900 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm26, %zmm3, %zmm0
20901 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
20902 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20903 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20904 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 {%k2} = zmm2[4,5,4,5],zmm4[4,5,4,5]
20905 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
20906 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm15, %zmm0
20907 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20908 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm4, %zmm3, %zmm2
20909 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20910 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20911 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20912 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20913 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
20914 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
20915 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
20916 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20917 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
20918 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20919 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20920 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20921 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20922 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k2} = zmm0[4,5,4,5],zmm1[4,5,4,5]
20923 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2
20924 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm15, %zmm2
20925 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20926 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm3, %zmm0
20927 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20928 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20929 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20930 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 # 64-byte Reload
20931 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm31 {%k2} = zmm0[4,5,4,5],zmm2[4,5,4,5]
20932 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20933 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm26
20934 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20935 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm1, %zmm15, %zmm26
20936 ; AVX512DQ-FCP-NEXT: vpermi2q %zmm0, %zmm2, %zmm15
20937 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm2, %zmm3, %zmm0
20938 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20939 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm0
20940 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20941 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 {%k2} = zmm1[4,5,4,5],zmm0[4,5,4,5]
20942 ; AVX512DQ-FCP-NEXT: vpermt2q %zmm0, %zmm3, %zmm1
20943 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20944 ; AVX512DQ-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
20945 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20946 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
20947 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20948 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
20949 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20950 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
20951 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20952 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
20953 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
20954 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm11 # 64-byte Folded Reload
20955 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20956 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm13 # 64-byte Folded Reload
20957 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20958 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
20959 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20960 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20961 ; AVX512DQ-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm14 # 64-byte Folded Reload
20962 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20963 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
20964 ; AVX512DQ-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
20965 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
20966 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20967 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
20968 ; AVX512DQ-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
20969 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm19, %zmm0
20970 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20971 ; AVX512DQ-FCP-NEXT: vpblendd $240, (%rsp), %ymm3, %ymm0 # 32-byte Folded Reload
20972 ; AVX512DQ-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
20973 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
20974 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
20975 ; AVX512DQ-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
20976 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm22, %zmm0
20977 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
20978 ; AVX512DQ-FCP-NEXT: movb $-32, %al
20979 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
20980 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20981 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1}
20982 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20983 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20984 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm12 {%k1}
20985 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20986 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20987 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm8 {%k1}
20988 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20989 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20990 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm9 {%k1}
20991 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20992 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm6 {%k1}
20993 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20994 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
20995 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1}
20996 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20997 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4 {%k1}
20998 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20999 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm31 {%k1}
21000 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21001 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21002 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
21003 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k1}
21004 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21005 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21006 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21007 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
21008 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21009 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
21010 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21011 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21012 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21013 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
21014 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21015 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
21016 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21017 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21018 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21019 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
21020 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21021 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
21022 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21023 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21024 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21025 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
21026 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21027 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
21028 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
21029 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
21030 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21031 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
21032 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21033 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k1}
21034 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
21035 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21036 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k2}
21037 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21038 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm2 {%k1}
21039 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21040 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
21041 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k2}
21042 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm0 {%k1}
21043 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm18
21044 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
21045 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm7 {%k1}
21046 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
21047 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21048 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm22 {%k1}
21049 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
21050 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21051 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k1}
21052 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
21053 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21054 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k1}
21055 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 # 64-byte Reload
21056 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21057 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm30 {%k1}
21058 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
21059 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21060 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1}
21061 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
21062 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21063 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm28 {%k1}
21064 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
21065 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
21066 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k1}
21067 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm0 # 32-byte Folded Reload
21068 ; AVX512DQ-FCP-NEXT: # ymm0 = ymm11[0,1,2,3],mem[4,5,6,7]
21069 ; AVX512DQ-FCP-NEXT: vmovdqa 2752(%rdi), %ymm3
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm26, %zmm3
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm1 = mem[8,9,10,11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm20, %zmm6
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm6 {%k1}
; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm1 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm1 = ymm13[0,1,2,3],mem[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm4
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm29, %zmm14
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm14 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
; AVX512DQ-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm17, %zmm5
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,2,3],ymm4[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm8
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm9, %zmm8
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm8 {%k1}
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm4, %zmm9, %zmm4
; AVX512DQ-FCP-NEXT: vmovdqa 2304(%rdi), %ymm10
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm10 = mem[8,9,10,11,12,13,14,15],ymm10[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm10[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm10, %zmm9, %zmm10
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm11 # 32-byte Folded Reload
; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,2,3],ymm9[4,5,6,7]
; AVX512DQ-FCP-NEXT: vmovdqa 1856(%rdi), %ymm12
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm12 = mem[8,9,10,11,12,13,14,15],ymm12[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm12[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm12, %xmm12
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm12, %zmm9, %zmm12
; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12 {%k1}
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vinsertf64x4 $0, %ymm11, %zmm9, %zmm11
; AVX512DQ-FCP-NEXT: vmovdqa 3200(%rdi), %ymm13
; AVX512DQ-FCP-NEXT: vpalignr {{.*#+}} ymm13 = mem[8,9,10,11,12,13,14,15],ymm13[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm13[16,17,18,19,20,21,22,23]
; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13
; AVX512DQ-FCP-NEXT: vinserti32x4 $0, %xmm13, %zmm15, %zmm9
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm9 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 448(%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 384(%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 320(%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 256(%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 192(%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 128(%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm13, 64(%rsi)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm13, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 448(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, 320(%rdx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rdx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rdx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 384(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, 448(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 256(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, 320(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 128(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 192(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, 384(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 448(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 320(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 128(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 384(%r8)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 448(%r9)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 256(%r9)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 320(%r9)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 128(%r9)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 192(%r9)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, (%r9)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 64(%r9)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 384(%r9)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 448(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 256(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 320(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 128(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm3, 384(%rax)
; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-FCP-NEXT: vmovaps %zmm11, 384(%rax)
; AVX512DQ-FCP-NEXT: vmovaps %zmm4, 448(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 256(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 320(%rax)
; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 128(%rax)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 192(%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm0, (%rax)
; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 64(%rax)
; AVX512DQ-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i64_stride7_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: subq $7624, %rsp # imm = 0x1DC8
; AVX512BW-NEXT: vmovdqa64 3328(%rdi), %zmm4
; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 3264(%rdi), %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 3008(%rdi), %zmm25
; AVX512BW-NEXT: vmovdqa64 2944(%rdi), %zmm18
; AVX512BW-NEXT: vmovdqa64 2880(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 2816(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 2752(%rdi), %zmm30
; AVX512BW-NEXT: vmovdqa64 2688(%rdi), %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 2432(%rdi), %zmm19
; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 2368(%rdi), %zmm8
; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 1536(%rdi), %zmm10
; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm12
; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm11
; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm13
; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm16
; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm14
; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm17
; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm15
; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa 2704(%rdi), %xmm2
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa 464(%rdi), %xmm2
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa 1360(%rdi), %xmm2
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa 912(%rdi), %xmm2
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa 2256(%rdi), %xmm2
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqa 1808(%rdi), %xmm2
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
; AVX512BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0
; AVX512BW-NEXT: vmovdqa 3152(%rdi), %xmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
; AVX512BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa 2816(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm16
; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm1
; AVX512BW-NEXT: vmovdqa64 3072(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa 576(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm27
; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12
; AVX512BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28
; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1
; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm31
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm8
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm31, %zmm9, %zmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm18
; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm29
; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm1
; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa 1472(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm26
; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm7
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 1664(%rdi), %zmm13
; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1
; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa 1024(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm19
; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm6
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm22
; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm1
; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa 2368(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512BW-NEXT: vmovdqa64 2304(%rdi), %zmm20
; AVX512BW-NEXT: vmovdqa64 2240(%rdi), %zmm5
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 2560(%rdi), %zmm15
; AVX512BW-NEXT: vmovdqa64 2496(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm1
; AVX512BW-NEXT: vmovdqa64 2624(%rdi), %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa 1920(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21
; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 2112(%rdi), %zmm14
; AVX512BW-NEXT: vmovdqa64 2048(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm1
; AVX512BW-NEXT: vmovdqa64 2176(%rdi), %zmm23
; AVX512BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0
; AVX512BW-NEXT: vmovdqa 3264(%rdi), %ymm1
; AVX512BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512BW-NEXT: vmovdqa64 3200(%rdi), %zmm17
; AVX512BW-NEXT: vmovdqa64 3136(%rdi), %zmm11
; AVX512BW-NEXT: vpermi2q %zmm17, %zmm11, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9
; AVX512BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm9
; AVX512BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9
; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm6
; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm11
; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 3456(%rdi), %zmm24
; AVX512BW-NEXT: vmovdqa64 3392(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm10
; AVX512BW-NEXT: vmovdqa64 3520(%rdi), %zmm30
; AVX512BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm10
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa 2880(%rdi), %ymm0
; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9
; AVX512BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm12
; AVX512BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm12
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa 640(%rdi), %ymm9
; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
; AVX512BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm12
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm12
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa 192(%rdi), %ymm9
; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm12
; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm12
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm12
; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa 1536(%rdi), %ymm9
; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm16
; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm12
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm12
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm12
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa 1088(%rdi), %ymm9
; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12
; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm12
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm12
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa 2432(%rdi), %ymm9
; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm9
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload
; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa 1984(%rdi), %ymm2
; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm3
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload
; AVX512BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm3
; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa 3328(%rdi), %ymm2
; AVX512BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermi2q %zmm17, %zmm2, %zmm0
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm1
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19
; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm20
; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21
; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm17
; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512BW-NEXT: vpermi2q %zmm8, %zmm24, %zmm4
; AVX512BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm4
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
; AVX512BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm25
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4]
; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm28, %zmm27, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5]
; AVX512BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6]
; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28
; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm28
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26
; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm9
; AVX512BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm9
; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17
; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16
; AVX512BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm29
; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19
; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm10
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm10
; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm13
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm13
; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm21
; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm2
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm22
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm4
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm3
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm3
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm20
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm7
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm3
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm29
; AVX512BW-NEXT: vpermi2q %zmm24, %zmm8, %zmm23
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm8
; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm27
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0
; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm31
; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm8
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm18
; AVX512BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm12
; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm26
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm11
; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm25
; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm22
; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm19
; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm23
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm10
; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm4
; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm27
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm17
; AVX512BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm13
; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20
; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm18
; AVX512BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
; AVX512BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm16
; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm9
; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm12
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm16
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm18
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm20
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm21
; AVX512BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5
; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm29
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26
; AVX512BW-NEXT: movb $24, %al
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0]
; AVX512BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11]
; AVX512BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5]
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3
; AVX512BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5]
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm28
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm28
; AVX512BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm30
; AVX512BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5]
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13
; AVX512BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
; AVX512BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17
; AVX512BW-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
; AVX512BW-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7]
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29
; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7]
; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: movb $-32, %al
; AVX512BW-NEXT: kmovd %eax, %k2
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
; AVX512BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm19
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm24
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
; AVX512BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm25
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22023 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
22024 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22025 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
22026 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
22027 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22028 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1}
22029 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22030 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
22031 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22032 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22033 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
22034 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22035 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2}
22036 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22037 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22038 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
22039 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2}
22040 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9
22041 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
22042 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22043 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2}
22044 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
22045 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22046 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
22047 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
22048 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22049 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
22050 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
22051 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22052 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
22053 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
22054 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22055 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
22056 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
22057 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22058 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
22059 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
22060 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22061 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
22062 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
22063 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2}
22064 ; AVX512BW-NEXT: vmovdqa 2752(%rdi), %ymm0
22065 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
22066 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
22067 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1
22068 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22069 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
22070 ; AVX512BW-NEXT: vmovdqa 512(%rdi), %ymm0
22071 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
22072 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm0
22073 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22074 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0
22075 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22076 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2}
22077 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm3
22078 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
22079 ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm3
22080 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
22081 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3
22082 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
22083 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2}
22084 ; AVX512BW-NEXT: vmovdqa 1408(%rdi), %ymm4
22085 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
22086 ; AVX512BW-NEXT: vextracti128 $1, %ymm4, %xmm4
22087 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
22088 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4
22089 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
22090 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2}
22091 ; AVX512BW-NEXT: vmovdqa 960(%rdi), %ymm5
22092 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
22093 ; AVX512BW-NEXT: vextracti128 $1, %ymm5, %xmm5
22094 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22095 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
22096 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22097 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2}
22098 ; AVX512BW-NEXT: vmovdqa 2304(%rdi), %ymm6
22099 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
22100 ; AVX512BW-NEXT: vextracti128 $1, %ymm6, %xmm6
22101 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
22102 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6
22103 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
22104 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2}
22105 ; AVX512BW-NEXT: vmovdqa 1856(%rdi), %ymm7
22106 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
22107 ; AVX512BW-NEXT: vextracti128 $1, %ymm7, %xmm7
22108 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22109 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7
22110 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22111 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
22112 ; AVX512BW-NEXT: vmovdqa 3200(%rdi), %ymm8
22113 ; AVX512BW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
22114 ; AVX512BW-NEXT: vextracti128 $1, %ymm8, %xmm8
22115 ; AVX512BW-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8
22116 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2}
22117 ; AVX512BW-NEXT: vmovdqa64 %zmm25, 448(%rsi)
22118 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
22119 ; AVX512BW-NEXT: vmovaps %zmm12, 384(%rsi)
22120 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
22121 ; AVX512BW-NEXT: vmovaps %zmm12, 320(%rsi)
22122 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
22123 ; AVX512BW-NEXT: vmovaps %zmm12, 256(%rsi)
22124 ; AVX512BW-NEXT: vmovdqa64 %zmm24, 192(%rsi)
22125 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
22126 ; AVX512BW-NEXT: vmovaps %zmm12, 128(%rsi)
22127 ; AVX512BW-NEXT: vmovdqa64 %zmm19, 64(%rsi)
22128 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
22129 ; AVX512BW-NEXT: vmovaps %zmm12, (%rsi)
22130 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 448(%rdx)
22131 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rdx)
22132 ; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rdx)
22133 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 128(%rdx)
22134 ; AVX512BW-NEXT: vmovdqa64 %zmm16, 192(%rdx)
22135 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22136 ; AVX512BW-NEXT: vmovaps %zmm2, (%rdx)
22137 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22138 ; AVX512BW-NEXT: vmovaps %zmm2, 64(%rdx)
22139 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 384(%rdx)
22140 ; AVX512BW-NEXT: vmovdqa64 %zmm27, 448(%rcx)
22141 ; AVX512BW-NEXT: vmovdqa64 %zmm26, 256(%rcx)
22142 ; AVX512BW-NEXT: vmovdqa64 %zmm23, 320(%rcx)
22143 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rcx)
22144 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 192(%rcx)
22145 ; AVX512BW-NEXT: vmovdqa64 %zmm15, (%rcx)
22146 ; AVX512BW-NEXT: vmovdqa64 %zmm14, 64(%rcx)
22147 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 384(%rcx)
22148 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%r8)
22149 ; AVX512BW-NEXT: vmovdqa64 %zmm7, 256(%r8)
22150 ; AVX512BW-NEXT: vmovdqa64 %zmm6, 320(%r8)
22151 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%r8)
22152 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 192(%r8)
22153 ; AVX512BW-NEXT: vmovdqa64 %zmm3, (%r8)
22154 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%r8)
22155 ; AVX512BW-NEXT: vmovdqa64 %zmm1, 384(%r8)
22156 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22157 ; AVX512BW-NEXT: vmovaps %zmm0, 448(%r9)
22158 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22159 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%r9)
22160 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22161 ; AVX512BW-NEXT: vmovaps %zmm0, 320(%r9)
22162 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22163 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%r9)
22164 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22165 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%r9)
22166 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22167 ; AVX512BW-NEXT: vmovaps %zmm0, (%r9)
22168 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22169 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%r9)
22170 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22171 ; AVX512BW-NEXT: vmovaps %zmm0, 384(%r9)
22172 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
22173 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22174 ; AVX512BW-NEXT: vmovaps %zmm0, 448(%rax)
22175 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22176 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
22177 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22178 ; AVX512BW-NEXT: vmovaps %zmm0, 320(%rax)
22179 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22180 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax)
22181 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22182 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax)
22183 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22184 ; AVX512BW-NEXT: vmovaps %zmm0, (%rax)
22185 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22186 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
22187 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22188 ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
22189 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
22190 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22191 ; AVX512BW-NEXT: vmovaps %zmm0, 384(%rax)
22192 ; AVX512BW-NEXT: vmovdqa64 %zmm29, 448(%rax)
22193 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22194 ; AVX512BW-NEXT: vmovaps %zmm0, 256(%rax)
22195 ; AVX512BW-NEXT: vmovdqa64 %zmm21, 320(%rax)
22196 ; AVX512BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
22197 ; AVX512BW-NEXT: vmovaps %zmm0, 128(%rax)
22198 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22199 ; AVX512BW-NEXT: vmovaps %zmm0, 192(%rax)
22200 ; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rax)
22201 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22202 ; AVX512BW-NEXT: vmovaps %zmm0, 64(%rax)
22203 ; AVX512BW-NEXT: addq $7624, %rsp # imm = 0x1DC8
22204 ; AVX512BW-NEXT: vzeroupper
22205 ; AVX512BW-NEXT: retq
;
22207 ; AVX512BW-FCP-LABEL: load_i64_stride7_vf64:
22208 ; AVX512BW-FCP: # %bb.0:
22209 ; AVX512BW-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8
22210 ; AVX512BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm4
22211 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22212 ; AVX512BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm6
22213 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22214 ; AVX512BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25
22215 ; AVX512BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18
22216 ; AVX512BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
22217 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22218 ; AVX512BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
22219 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22220 ; AVX512BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30
22221 ; AVX512BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5
22222 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22223 ; AVX512BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19
22224 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22225 ; AVX512BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8
22226 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22227 ; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9
22228 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22229 ; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
22230 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22231 ; AVX512BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm10
22232 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22233 ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm12
22234 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
22235 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11
22236 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22237 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13
22238 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22239 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16
22240 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22241 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14
22242 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22243 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17
22244 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22245 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15
22246 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22247 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
22248 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
22249 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
22250 ; AVX512BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
22251 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
22252 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
22253 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22254 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
22255 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
22256 ; AVX512BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm2
22257 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
22258 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
22259 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22260 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
22261 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
22262 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
22263 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
22264 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
22265 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22266 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
22267 ; AVX512BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1
22268 ; AVX512BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
22269 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
22270 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
22271 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22272 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
22273 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
22274 ; AVX512BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm2
22275 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
22276 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
22277 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22278 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
22279 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
22280 ; AVX512BW-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2
22281 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
22282 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
22283 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22284 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
22285 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm1
22286 ; AVX512BW-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2
22287 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
22288 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
22289 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22290 ; AVX512BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm0
22291 ; AVX512BW-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1
22292 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
22293 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
22294 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22295 ; AVX512BW-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0
22296 ; AVX512BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1
22297 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22298 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
22299 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
22300 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
22301 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1
22302 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
22303 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
22304 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
22305 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16
22306 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22307 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1
22308 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22309 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm1
22310 ; AVX512BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2
22311 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22312 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
22313 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
22314 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
22315 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22316 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22317 ; AVX512BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
22318 ; AVX512BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
22319 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22320 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
22321 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27
22322 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
22323 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
22324 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
22325 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm1
22326 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
22327 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
22328 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1
22329 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22330 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm1
22331 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2
22332 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22333 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
22334 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22335 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22336 ; AVX512BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
22337 ; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
22338 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22339 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
22340 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31
22341 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
22342 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
22343 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm1
22344 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
22345 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18
22346 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29
22347 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
22348 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22349 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm1
22350 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2
22351 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22352 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
22353 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22354 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22355 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
22356 ; AVX512BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm1
22357 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22358 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
22359 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm26
22360 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7
22361 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
22362 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1
22363 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
22364 ; AVX512BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm13
22365 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1
22366 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22367 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm1
22368 ; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2
22369 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22370 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
22371 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22372 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22373 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
22374 ; AVX512BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm1
22375 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22376 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
22377 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19
22378 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6
22379 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
22380 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22381 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm1
22382 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
22383 ; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm22
22384 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
22385 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22386 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm1
22387 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2
22388 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22389 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
22390 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22391 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22392 ; AVX512BW-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0
22393 ; AVX512BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1
22394 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22395 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
22396 ; AVX512BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm20
22397 ; AVX512BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm5
22398 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
22399 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22400 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1
22401 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
22402 ; AVX512BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm15
22403 ; AVX512BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1
22404 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22405 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm1
22406 ; AVX512BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3
22407 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22408 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
22409 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22410 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22411 ; AVX512BW-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0
22412 ; AVX512BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm1
22413 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22414 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
22415 ; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21
22416 ; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2
22417 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
22418 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22419 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm1
22420 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
22421 ; AVX512BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm14
22422 ; AVX512BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm1
22423 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22424 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm1
22425 ; AVX512BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm23
22426 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1
22427 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22428 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22429 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22430 ; AVX512BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0
22431 ; AVX512BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm1
22432 ; AVX512BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
22433 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
22434 ; AVX512BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm17
22435 ; AVX512BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm11
22436 ; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm11, %zmm9
22437 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22438 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
22439 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9
22440 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm9
22441 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22442 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
22443 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22444 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9
22445 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm9
22446 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22447 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm6
22448 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22449 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
22450 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm9
22451 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22452 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm2
22453 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22454 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm5
22455 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22456 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
22457 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
22458 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6
22459 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm9
22460 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22461 ; AVX512BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm11
22462 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22463 ; AVX512BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm24
22464 ; AVX512BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm2
22465 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22466 ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm10
22467 ; AVX512BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm30
22468 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm10
22469 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
22470 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22471 ; AVX512BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm0
22472 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
22473 ; AVX512BW-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
22474 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
22475 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
22476 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9
22477 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22478 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7]
22479 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
22480 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
22481 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm12
22482 ; AVX512BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm12
22483 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
22484 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
22485 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
22486 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
22487 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22488 ; AVX512BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm9
22489 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
22490 ; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
22491 ; AVX512BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
22492 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22493 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
22494 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12
22495 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
22496 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm12
22497 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
22498 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
22499 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22500 ; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
22501 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
22502 ; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
22503 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22504 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
22505 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22506 ; AVX512BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm12
22507 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
22508 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12
22509 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
22510 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
22511 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
22512 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22513 ; AVX512BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm9
22514 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
22515 ; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
22516 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5
22517 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22518 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
22519 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm16
22520 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22521 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12
22522 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
22523 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12
22524 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
22525 ; AVX512BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm12
22526 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
22527 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7
22528 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22529 ; AVX512BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9
22530 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
22531 ; AVX512BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
22532 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
22533 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
22534 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22535 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm12
22536 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
22537 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm12
22538 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
22539 ; AVX512BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
22540 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
22541 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
22542 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22543 ; AVX512BW-FCP-NEXT: vmovdqa 2432(%rdi), %ymm9
22544 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
22545 ; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
22546 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
22547 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9
22548 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22549 ; AVX512BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm9
22550 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
22551 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
22552 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload
22553 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload
22554 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2
22555 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22556 ; AVX512BW-FCP-NEXT: vmovdqa 1984(%rdi), %ymm2
22557 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
22558 ; AVX512BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
22559 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22560 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
22561 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22562 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3
22563 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
22564 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
22565 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload
22566 ; AVX512BW-FCP-NEXT: vpermt2q %zmm23, %zmm10, %zmm3
22567 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
22568 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22569 ; AVX512BW-FCP-NEXT: vmovdqa 3328(%rdi), %ymm2
22570 ; AVX512BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
22571 ; AVX512BW-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
22572 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22573 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22574 ; AVX512BW-FCP-NEXT: vpermi2q %zmm17, %zmm2, %zmm0
22575 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
22576 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload
22577 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22578 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1
22579 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
22580 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22581 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
22582 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1
22583 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22584 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
22585 ; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1
22586 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22587 ; AVX512BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19
22588 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22589 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm20
22590 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22591 ; AVX512BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm21
22592 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22593 ; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm17
22594 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22595 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22596 ; AVX512BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm4
22597 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm4
22598 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
22599 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22600 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
22601 ; AVX512BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22602 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
22603 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm25
22604 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4]
22605 ; AVX512BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22606 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
22607 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm27, %zmm1
22608 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22609 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5]
22610 ; AVX512BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22611 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
22612 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm1
22613 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22614 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6]
22615 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22616 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
22617 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
22618 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22619 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
22620 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
22621 ; AVX512BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0
22622 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22623 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
22624 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
22625 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
22626 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22627 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
22628 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0
22629 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22630 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
22631 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm0
22632 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22633 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
22634 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm0
22635 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22636 ; AVX512BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28
22637 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22638 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
22639 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
22640 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28
22641 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
22642 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm0
22643 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22644 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
22645 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm0
22646 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22647 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
22648 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm0
22649 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22650 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26
22651 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22652 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9
22653 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm9
22654 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
22655 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm0
22656 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22657 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
22658 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm31, %zmm0
22659 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17
22660 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
22661 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0
22662 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
22663 ; AVX512BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm29
22664 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22665 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
22666 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0
22667 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
22668 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19
22669 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10
22670 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm10
22671 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13
22672 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm13
22673 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0
22674 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0
22675 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22676 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm21
22677 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22678 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
22679 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
22680 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm2
22681 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22
22682 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
22683 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm4
22684 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
22685 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm3
22686 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20
22687 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
22688 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm7
22689 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5
22690 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
22691 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3
22692 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
22693 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm3
22694 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm29
22695 ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm8, %zmm23
22696 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22697 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
22698 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22699 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
22700 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm2
22701 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22702 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
22703 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm6
22704 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22705 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22706 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
22707 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22708 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
22709 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
22710 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22711 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22712 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
22713 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22714 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22715 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
22716 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22717 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22718 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22719 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm8
22720 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22721 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22722 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload
22723 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22724 ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm27
22725 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
22726 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0
22727 ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm31
22728 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8
22729 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm8
22730 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18
22731 ; AVX512BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm12
22732 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29
22733 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm26
22734 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
22735 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
22736 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22737 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
22738 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm11
22739 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22740 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
22741 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm25
22742 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22743 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
22744 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
22745 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22746 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
22747 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
22748 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22749 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
22750 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm22
22751 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22752 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
22753 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm19
22754 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22755 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
22756 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm3
22757 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22758 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm23
22759 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
22760 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
22761 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2
22762 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22763 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
22764 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
22765 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22766 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22767 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2
22768 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22769 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22770 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
22771 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22772 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22773 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
22774 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22775 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm10
22776 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22777 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm4
22778 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22779 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm27
22780 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
22781 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
22782 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0
22783 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22784 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
22785 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
22786 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22787 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
22788 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
22789 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22790 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22791 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
22792 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22793 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm17
22794 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22795 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm13
22796 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22797 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm20
22798 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22799 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm31
22800 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
22801 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
22802 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm18
22803 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22804 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
22805 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
22806 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22807 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
22808 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
22809 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22810 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
22811 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
22812 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22813 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm16
22814 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22815 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22816 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm9
22817 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22818 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm7
22819 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22820 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm12
22821 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
22822 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
22823 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
22824 ; AVX512BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm16
22825 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
22826 ; AVX512BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
22827 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
22828 ; AVX512BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm18
22829 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
22830 ; AVX512BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm20
22831 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
22832 ; AVX512BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm21
22833 ; AVX512BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5
22834 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22835 ; AVX512BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm29
22836 ; AVX512BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm26
22837 ; AVX512BW-FCP-NEXT: movb $24, %al
22838 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
22839 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22840 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
22841 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
22842 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0]
22843 ; AVX512BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
22844 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
22845 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0
22846 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22847 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11]
22848 ; AVX512BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm1
22849 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22850 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22851 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22852 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
22853 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
22854 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
22855 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
22856 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22857 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
22858 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22859 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
22860 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22861 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
22862 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
22863 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
22864 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
22865 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22866 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
22867 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
22868 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22869 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22870 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
22871 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
22872 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
22873 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
22874 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22875 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
22876 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22877 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22878 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22879 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
22880 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
22881 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
22882 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
22883 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22884 ; AVX512BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
22885 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22886 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22887 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
22888 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5]
22889 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
22890 ; AVX512BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
22891 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22892 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
22893 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22894 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
22895 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22896 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
22897 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5]
22898 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22899 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28
22900 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
22901 ; AVX512BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm28
22902 ; AVX512BW-FCP-NEXT: vpermi2q %zmm15, %zmm6, %zmm30
22903 ; AVX512BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm15
22904 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
22905 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5]
22906 ; AVX512BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13
22907 ; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
22908 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
22909 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
22910 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
22911 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
22912 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22913 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
22914 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
22915 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
22916 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
22917 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
22918 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
22919 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
22920 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
22921 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
22922 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
22923 ; AVX512BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
22924 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
22925 ; AVX512BW-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
22926 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
22927 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22928 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
22929 ; AVX512BW-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
22930 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17
22931 ; AVX512BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
22932 ; AVX512BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
22933 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
22934 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22935 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
22936 ; AVX512BW-FCP-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7]
22937 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
22938 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
22939 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
22940 ; AVX512BW-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
22941 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
22942 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
22943 ; AVX512BW-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7]
22944 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22945 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
22946 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22947 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7]
22948 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29
22949 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7]
22950 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
22951 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22952 ; AVX512BW-FCP-NEXT: movb $-32, %al
22953 ; AVX512BW-FCP-NEXT: kmovd %eax, %k2
22954 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22955 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
22956 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22957 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19
22958 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22959 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
22960 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22961 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
22962 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22963 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24
22964 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22965 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
22966 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22967 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
22968 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22969 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22970 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
22971 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22972 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22973 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
22974 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22975 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
22976 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25
22977 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
22978 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22979 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
22980 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22981 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
22982 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22
22983 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22984 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22985 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
22986 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22987 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
22988 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22989 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
22990 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22991 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
22992 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22993 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
22994 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
22995 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
22996 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22997 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
22998 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
22999 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
23000 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
23001 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23002 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
23003 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23004 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
23005 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
23006 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23007 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1}
23008 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23009 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
23010 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23011 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23012 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
23013 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23014 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2}
23015 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23016 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23017 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
23018 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2}
23019 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
23020 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
23021 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23022 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2}
23023 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
23024 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23025 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
23026 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
23027 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23028 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
23029 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
23030 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23031 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
23032 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
23033 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23034 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
23035 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
23036 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23037 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
23038 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
23039 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23040 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
23041 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
23042 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2}
23043 ; AVX512BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm0
23044 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
23045 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
23046 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1
23047 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23048 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
23049 ; AVX512BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm0
23050 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
23051 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
23052 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23053 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0
23054 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23055 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2}
23056 ; AVX512BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
23057 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
23058 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
23059 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
23060 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3
23061 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
23062 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2}
23063 ; AVX512BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4
23064 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
23065 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
23066 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
23067 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4
23068 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
23069 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2}
23070 ; AVX512BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm5
23071 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
23072 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
23073 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23074 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
23075 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23076 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2}
23077 ; AVX512BW-FCP-NEXT: vmovdqa 2304(%rdi), %ymm6
23078 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
23079 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
23080 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
23081 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6
23082 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
23083 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2}
23084 ; AVX512BW-FCP-NEXT: vmovdqa 1856(%rdi), %ymm7
23085 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
23086 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
23087 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23088 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7
23089 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23090 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
23091 ; AVX512BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm8
23092 ; AVX512BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
23093 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
23094 ; AVX512BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8
23095 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2}
23096 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rsi)
23097 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
23098 ; AVX512BW-FCP-NEXT: vmovaps %zmm12, 384(%rsi)
23099 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
23100 ; AVX512BW-FCP-NEXT: vmovaps %zmm12, 320(%rsi)
23101 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
23102 ; AVX512BW-FCP-NEXT: vmovaps %zmm12, 256(%rsi)
23103 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rsi)
23104 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
23105 ; AVX512BW-FCP-NEXT: vmovaps %zmm12, 128(%rsi)
23106 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rsi)
23107 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
23108 ; AVX512BW-FCP-NEXT: vmovaps %zmm12, (%rsi)
23109 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx)
23110 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
23111 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rdx)
23112 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rdx)
23113 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx)
23114 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23115 ; AVX512BW-FCP-NEXT: vmovaps %zmm2, (%rdx)
23116 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23117 ; AVX512BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
23118 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rdx)
23119 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, 448(%rcx)
23120 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 256(%rcx)
23121 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, 320(%rcx)
23122 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rcx)
23123 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx)
23124 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
23125 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rcx)
23126 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx)
23127 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%r8)
23128 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, 256(%r8)
23129 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%r8)
23130 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8)
23131 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
23132 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%r8)
23133 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
23134 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
23135 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23136 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%r9)
23137 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23138 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%r9)
23139 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23140 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%r9)
23141 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23142 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%r9)
23143 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23144 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%r9)
23145 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23146 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%r9)
23147 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23148 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%r9)
23149 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23150 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%r9)
23151 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
23152 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23153 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 448(%rax)
23154 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23155 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
23156 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23157 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
23158 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23159 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
23160 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23161 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
23162 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23163 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, (%rax)
23164 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23165 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
23166 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23167 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
23168 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
23169 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23170 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
23171 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax)
23172 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23173 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
23174 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax)
23175 ; AVX512BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
23176 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
23177 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23178 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
23179 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
23180 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23181 ; AVX512BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
23182 ; AVX512BW-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8
23183 ; AVX512BW-FCP-NEXT: vzeroupper
23184 ; AVX512BW-FCP-NEXT: retq
23185 ;
23186 ; AVX512DQ-BW-LABEL: load_i64_stride7_vf64:
23187 ; AVX512DQ-BW: # %bb.0:
23188 ; AVX512DQ-BW-NEXT: subq $7624, %rsp # imm = 0x1DC8
23189 ; AVX512DQ-BW-NEXT: vmovdqa64 3328(%rdi), %zmm4
23190 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23191 ; AVX512DQ-BW-NEXT: vmovdqa64 3264(%rdi), %zmm6
23192 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23193 ; AVX512DQ-BW-NEXT: vmovdqa64 3008(%rdi), %zmm25
23194 ; AVX512DQ-BW-NEXT: vmovdqa64 2944(%rdi), %zmm18
23195 ; AVX512DQ-BW-NEXT: vmovdqa64 2880(%rdi), %zmm2
23196 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23197 ; AVX512DQ-BW-NEXT: vmovdqa64 2816(%rdi), %zmm1
23198 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23199 ; AVX512DQ-BW-NEXT: vmovdqa64 2752(%rdi), %zmm30
23200 ; AVX512DQ-BW-NEXT: vmovdqa64 2688(%rdi), %zmm5
23201 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23202 ; AVX512DQ-BW-NEXT: vmovdqa64 2432(%rdi), %zmm19
23203 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23204 ; AVX512DQ-BW-NEXT: vmovdqa64 2368(%rdi), %zmm8
23205 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23206 ; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm9
23207 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23208 ; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm3
23209 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23210 ; AVX512DQ-BW-NEXT: vmovdqa64 1536(%rdi), %zmm10
23211 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23212 ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm12
23213 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
23214 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm11
23215 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23216 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm13
23217 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23218 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm16
23219 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23220 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm14
23221 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23222 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm17
23223 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23224 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm15
23225 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23226 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
23227 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
23228 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
23229 ; AVX512DQ-BW-NEXT: vmovdqa 2704(%rdi), %xmm2
23230 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
23231 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
23232 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23233 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1
23234 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
23235 ; AVX512DQ-BW-NEXT: vmovdqa 464(%rdi), %xmm2
23236 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
23237 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
23238 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23239 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1
23240 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
23241 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm2
23242 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
23243 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
23244 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23245 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1
23246 ; AVX512DQ-BW-NEXT: vpermt2q %zmm10, %zmm0, %zmm1
23247 ; AVX512DQ-BW-NEXT: vmovdqa 1360(%rdi), %xmm2
23248 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
23249 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
23250 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23251 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1
23252 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
23253 ; AVX512DQ-BW-NEXT: vmovdqa 912(%rdi), %xmm2
23254 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
23255 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
23256 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23257 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
23258 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
23259 ; AVX512DQ-BW-NEXT: vmovdqa 2256(%rdi), %xmm2
23260 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
23261 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
23262 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23263 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1
23264 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm0, %zmm1
23265 ; AVX512DQ-BW-NEXT: vmovdqa 1808(%rdi), %xmm2
23266 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
23267 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
23268 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23269 ; AVX512DQ-BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm0
23270 ; AVX512DQ-BW-NEXT: vmovdqa 3152(%rdi), %xmm1
23271 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
23272 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
23273 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23274 ; AVX512DQ-BW-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0
23275 ; AVX512DQ-BW-NEXT: vmovdqa 2816(%rdi), %ymm1
23276 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23277 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
23278 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
23279 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1
23280 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm9, %zmm1
23281 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
23282 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
23283 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
23284 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm16
23285 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23286 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm1
23287 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23288 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm10, %zmm1
23289 ; AVX512DQ-BW-NEXT: vmovdqa64 3072(%rdi), %zmm2
23290 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23291 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
23292 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
23293 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
23294 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23295 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23296 ; AVX512DQ-BW-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
23297 ; AVX512DQ-BW-NEXT: vmovdqa 576(%rdi), %ymm1
23298 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23299 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
23300 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm27
23301 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm2
23302 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1
23303 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12
23304 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm9, %zmm1
23305 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
23306 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm28
23307 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm1
23308 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23309 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm10, %zmm1
23310 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm2
23311 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23312 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
23313 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23314 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23315 ; AVX512DQ-BW-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
23316 ; AVX512DQ-BW-NEXT: vmovdqa 128(%rdi), %ymm1
23317 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23318 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
23319 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm31
23320 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm8
23321 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
23322 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm9, %zmm1
23323 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
23324 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm18
23325 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm29
23326 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm1
23327 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23328 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm10, %zmm1
23329 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm2
23330 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23331 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
23332 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23333 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23334 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
23335 ; AVX512DQ-BW-NEXT: vmovdqa 1472(%rdi), %ymm1
23336 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23337 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
23338 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm26
23339 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm7
23340 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1
23341 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm9, %zmm1
23342 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
23343 ; AVX512DQ-BW-NEXT: vmovdqa64 1664(%rdi), %zmm13
23344 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm1
23345 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23346 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm10, %zmm1
23347 ; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm2
23348 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23349 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
23350 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23351 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23352 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
23353 ; AVX512DQ-BW-NEXT: vmovdqa 1024(%rdi), %ymm1
23354 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23355 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
23356 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm19
23357 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm6
23358 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1
23359 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23360 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm9, %zmm1
23361 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
23362 ; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm22
23363 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm1
23364 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23365 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm10, %zmm1
23366 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm2
23367 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23368 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
23369 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23370 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23371 ; AVX512DQ-BW-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0
23372 ; AVX512DQ-BW-NEXT: vmovdqa 2368(%rdi), %ymm1
23373 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23374 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
23375 ; AVX512DQ-BW-NEXT: vmovdqa64 2304(%rdi), %zmm20
23376 ; AVX512DQ-BW-NEXT: vmovdqa64 2240(%rdi), %zmm5
23377 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1
23378 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23379 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm9, %zmm1
23380 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
23381 ; AVX512DQ-BW-NEXT: vmovdqa64 2560(%rdi), %zmm15
23382 ; AVX512DQ-BW-NEXT: vmovdqa64 2496(%rdi), %zmm1
23383 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23384 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm10, %zmm1
23385 ; AVX512DQ-BW-NEXT: vmovdqa64 2624(%rdi), %zmm3
23386 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23387 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
23388 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23389 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23390 ; AVX512DQ-BW-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0
23391 ; AVX512DQ-BW-NEXT: vmovdqa 1920(%rdi), %ymm1
23392 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23393 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
23394 ; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm21
23395 ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm2
23396 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1
23397 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23398 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm9, %zmm1
23399 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
23400 ; AVX512DQ-BW-NEXT: vmovdqa64 2112(%rdi), %zmm14
23401 ; AVX512DQ-BW-NEXT: vmovdqa64 2048(%rdi), %zmm1
23402 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23403 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm10, %zmm1
23404 ; AVX512DQ-BW-NEXT: vmovdqa64 2176(%rdi), %zmm23
23405 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm4, %zmm1
23406 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23407 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23408 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23409 ; AVX512DQ-BW-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0
23410 ; AVX512DQ-BW-NEXT: vmovdqa 3264(%rdi), %ymm1
23411 ; AVX512DQ-BW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
23412 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
23413 ; AVX512DQ-BW-NEXT: vmovdqa64 3200(%rdi), %zmm17
23414 ; AVX512DQ-BW-NEXT: vmovdqa64 3136(%rdi), %zmm11
23415 ; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm11, %zmm9
23416 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23417 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
23418 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm9
23419 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm10, %zmm9
23420 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23421 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1
23422 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23423 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm9
23424 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm10, %zmm9
23425 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23426 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm10, %zmm6
23427 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23428 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9
23429 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm10, %zmm9
23430 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23431 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm10, %zmm2
23432 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23433 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm10, %zmm5
23434 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23435 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
23436 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9
23437 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm6
23438 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm9
23439 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23440 ; AVX512DQ-BW-NEXT: vpermt2q %zmm17, %zmm10, %zmm11
23441 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23442 ; AVX512DQ-BW-NEXT: vmovdqa64 3456(%rdi), %zmm24
23443 ; AVX512DQ-BW-NEXT: vmovdqa64 3392(%rdi), %zmm2
23444 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23445 ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm2, %zmm10
23446 ; AVX512DQ-BW-NEXT: vmovdqa64 3520(%rdi), %zmm30
23447 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm4, %zmm10
23448 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
23449 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23450 ; AVX512DQ-BW-NEXT: vmovdqa 2880(%rdi), %ymm0
23451 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
23452 ; AVX512DQ-BW-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
23453 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
23454 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm9
23455 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm0, %zmm9
23456 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23457 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7]
23458 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
23459 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
23460 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm12
23461 ; AVX512DQ-BW-NEXT: vpermt2q %zmm16, %zmm4, %zmm12
23462 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
23463 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
23464 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
23465 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
23466 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23467 ; AVX512DQ-BW-NEXT: vmovdqa 640(%rdi), %ymm9
23468 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
23469 ; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
23470 ; AVX512DQ-BW-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
23471 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23472 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
23473 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm12
23474 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
23475 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm4, %zmm12
23476 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
23477 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
23478 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23479 ; AVX512DQ-BW-NEXT: vmovdqa 192(%rdi), %ymm9
23480 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
23481 ; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
23482 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23483 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm12
23484 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23485 ; AVX512DQ-BW-NEXT: vpermt2q %zmm31, %zmm0, %zmm12
23486 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
23487 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm12
23488 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
23489 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
23490 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
23491 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23492 ; AVX512DQ-BW-NEXT: vmovdqa 1536(%rdi), %ymm9
23493 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
23494 ; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
23495 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm5
23496 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23497 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12
23498 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm16
23499 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23500 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm0, %zmm12
23501 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
23502 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm12
23503 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
23504 ; AVX512DQ-BW-NEXT: vpermt2q %zmm26, %zmm4, %zmm12
23505 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
23506 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7
23507 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23508 ; AVX512DQ-BW-NEXT: vmovdqa 1088(%rdi), %ymm9
23509 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
23510 ; AVX512DQ-BW-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
23511 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
23512 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12
23513 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23514 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm0, %zmm12
23515 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
23516 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm12
23517 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
23518 ; AVX512DQ-BW-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
23519 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
23520 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
23521 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23522 ; AVX512DQ-BW-NEXT: vmovdqa 2432(%rdi), %ymm9
23523 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
23524 ; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
23525 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
23526 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm9
23527 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23528 ; AVX512DQ-BW-NEXT: vpermt2q %zmm20, %zmm0, %zmm9
23529 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
23530 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9
23531 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload
23532 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload
23533 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2
23534 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23535 ; AVX512DQ-BW-NEXT: vmovdqa 1984(%rdi), %ymm2
23536 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
23537 ; AVX512DQ-BW-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
23538 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23539 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm3
23540 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23541 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm0, %zmm3
23542 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
23543 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm3
23544 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload
23545 ; AVX512DQ-BW-NEXT: vpermt2q %zmm23, %zmm10, %zmm3
23546 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
23547 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23548 ; AVX512DQ-BW-NEXT: vmovdqa 3328(%rdi), %ymm2
23549 ; AVX512DQ-BW-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
23550 ; AVX512DQ-BW-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
23551 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23552 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23553 ; AVX512DQ-BW-NEXT: vpermi2q %zmm17, %zmm2, %zmm0
23554 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
23555 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload
23556 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23557 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm1
23558 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
23559 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23560 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm1
23561 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm4, %zmm1
23562 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23563 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1
23564 ; AVX512DQ-BW-NEXT: vpermt2q %zmm5, %zmm4, %zmm1
23565 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23566 ; AVX512DQ-BW-NEXT: vpermt2q %zmm7, %zmm4, %zmm19
23567 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23568 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm4, %zmm20
23569 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23570 ; AVX512DQ-BW-NEXT: vpermt2q %zmm9, %zmm4, %zmm21
23571 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23572 ; AVX512DQ-BW-NEXT: vpermt2q %zmm2, %zmm4, %zmm17
23573 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23574 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23575 ; AVX512DQ-BW-NEXT: vpermi2q %zmm8, %zmm24, %zmm4
23576 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm10, %zmm4
23577 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
23578 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23579 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
23580 ; AVX512DQ-BW-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23581 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm0
23582 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm23, %zmm25
23583 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4]
23584 ; AVX512DQ-BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23585 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1
23586 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm27, %zmm1
23587 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23588 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5]
23589 ; AVX512DQ-BW-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23590 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1
23591 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm31, %zmm1
23592 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23593 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6]
23594 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23595 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1
23596 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
23597 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23598 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
23599 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
23600 ; AVX512DQ-BW-NEXT: vpermt2q %zmm28, %zmm1, %zmm0
23601 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23602 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
23603 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0
23604 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
23605 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23606 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0
23607 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm27, %zmm0
23608 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23609 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0
23610 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm31, %zmm0
23611 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23612 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm0
23613 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm0
23614 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23615 ; AVX512DQ-BW-NEXT: vpermt2q %zmm18, %zmm1, %zmm28
23616 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23617 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0
23618 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
23619 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm28
23620 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0
23621 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm27, %zmm0
23622 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23623 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0
23624 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm31, %zmm0
23625 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23626 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0
23627 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm12, %zmm0
23628 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23629 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm1, %zmm26
23630 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23631 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm9
23632 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm23, %zmm9
23633 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0
23634 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm27, %zmm0
23635 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23636 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0
23637 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm31, %zmm0
23638 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm17
23639 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm0
23640 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm12, %zmm0
23641 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16
23642 ; AVX512DQ-BW-NEXT: vpermt2q %zmm22, %zmm1, %zmm29
23643 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23644 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
23645 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0
23646 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
23647 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19
23648 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm10
23649 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm27, %zmm10
23650 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm13
23651 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm31, %zmm13
23652 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm0
23653 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm0
23654 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23655 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm1, %zmm21
23656 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23657 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
23658 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm2
23659 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm2
23660 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm22
23661 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4
23662 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm4
23663 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm3
23664 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm3
23665 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm20
23666 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm7
23667 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm7
23668 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm5
23669 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
23670 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm3
23671 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
23672 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm23, %zmm3
23673 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm29
23674 ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm8, %zmm23
23675 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23676 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
23677 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23678 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm2
23679 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm2
23680 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23681 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
23682 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm12, %zmm6
23683 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23684 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23685 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
23686 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23687 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
23688 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
23689 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23690 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23691 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
23692 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23693 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23694 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
23695 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23696 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23697 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23698 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm8
23699 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23700 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23701 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload
23702 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23703 ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm27
23704 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm0
23705 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm31, %zmm0
23706 ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm31
23707 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm8
23708 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm12, %zmm8
23709 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm18
23710 ; AVX512DQ-BW-NEXT: vpermi2q %zmm24, %zmm29, %zmm12
23711 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm29
23712 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm26
23713 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
23714 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
23715 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23716 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
23717 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm11
23718 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23719 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
23720 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm25
23721 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23722 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
23723 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
23724 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23725 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
23726 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
23727 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23728 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
23729 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm22
23730 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23731 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
23732 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm19
23733 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23734 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
23735 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm3
23736 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23737 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm23
23738 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
23739 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
23740 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm2
23741 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23742 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
23743 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
23744 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23745 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23746 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm2
23747 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23748 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23749 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
23750 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23751 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23752 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
23753 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23754 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm10
23755 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23756 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm4
23757 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23758 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm27
23759 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
23760 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
23761 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm0
23762 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23763 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
23764 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
23765 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23766 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
23767 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
23768 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23769 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23770 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
23771 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23772 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm17
23773 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23774 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm13
23775 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23776 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm20
23777 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23778 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm31
23779 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
23780 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
23781 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm18
23782 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23783 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
23784 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
23785 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23786 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
23787 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
23788 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23789 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
23790 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
23791 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23792 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm16
23793 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23794 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23795 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm9
23796 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23797 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm7
23798 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23799 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm12
23800 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
23801 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
23802 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
23803 ; AVX512DQ-BW-NEXT: vpermt2q %zmm21, %zmm1, %zmm16
23804 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
23805 ; AVX512DQ-BW-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
23806 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
23807 ; AVX512DQ-BW-NEXT: vpermt2q %zmm14, %zmm1, %zmm18
23808 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
23809 ; AVX512DQ-BW-NEXT: vpermt2q %zmm24, %zmm1, %zmm20
23810 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
23811 ; AVX512DQ-BW-NEXT: vpermt2q %zmm11, %zmm1, %zmm21
23812 ; AVX512DQ-BW-NEXT: vpermt2q %zmm25, %zmm1, %zmm5
23813 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23814 ; AVX512DQ-BW-NEXT: vpermt2q %zmm30, %zmm1, %zmm29
23815 ; AVX512DQ-BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm26
23816 ; AVX512DQ-BW-NEXT: movb $24, %al
23817 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
23818 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23819 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
23820 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
23821 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0]
23822 ; AVX512DQ-BW-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
23823 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm0
23824 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm30, %zmm0
23825 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23826 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11]
23827 ; AVX512DQ-BW-NEXT: vpermt2q %zmm15, %zmm2, %zmm1
23828 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23829 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23830 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23831 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
23832 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
23833 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0
23834 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
23835 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23836 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
23837 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23838 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
23839 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23840 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
23841 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
23842 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0
23843 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
23844 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23845 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
23846 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
23847 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23848 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23849 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
23850 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
23851 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0
23852 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
23853 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23854 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
23855 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23856 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23857 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23858 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
23859 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
23860 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm3
23861 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
23862 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23863 ; AVX512DQ-BW-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
23864 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23865 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23866 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
23867 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5]
23868 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3
23869 ; AVX512DQ-BW-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
23870 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23871 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
23872 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23873 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
23874 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23875 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
23876 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5]
23877 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23878 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm28
23879 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
23880 ; AVX512DQ-BW-NEXT: vpermt2q %zmm13, %zmm30, %zmm28
23881 ; AVX512DQ-BW-NEXT: vpermi2q %zmm15, %zmm6, %zmm30
23882 ; AVX512DQ-BW-NEXT: vpermt2q %zmm6, %zmm2, %zmm15
23883 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
23884 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5]
23885 ; AVX512DQ-BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm13
23886 ; AVX512DQ-BW-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
23887 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23888 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
23889 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
23890 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
23891 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23892 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
23893 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
23894 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
23895 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
23896 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
23897 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
23898 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
23899 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
23900 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
23901 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
23902 ; AVX512DQ-BW-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
23903 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
23904 ; AVX512DQ-BW-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
23905 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
23906 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23907 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
23908 ; AVX512DQ-BW-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
23909 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17
23910 ; AVX512DQ-BW-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
23911 ; AVX512DQ-BW-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
23912 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
23913 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23914 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
23915 ; AVX512DQ-BW-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7]
23916 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
23917 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
23918 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
23919 ; AVX512DQ-BW-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
23920 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
23921 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
23922 ; AVX512DQ-BW-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7]
23923 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23924 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
23925 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23926 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7]
23927 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29
23928 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7]
23929 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
23930 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23931 ; AVX512DQ-BW-NEXT: movb $-32, %al
23932 ; AVX512DQ-BW-NEXT: kmovd %eax, %k2
23933 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23934 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
23935 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23936 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm19
23937 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23938 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
23939 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23940 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
23941 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23942 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm24
23943 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23944 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
23945 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23946 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
23947 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23948 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23949 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
23950 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23951 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23952 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
23953 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23954 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
23955 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm25
23956 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
23957 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23958 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
23959 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23960 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
23961 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm22
23962 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23963 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23964 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
23965 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23966 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
23967 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23968 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23969 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23970 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
23971 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23972 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
23973 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
23974 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
23975 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23976 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
23977 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23978 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
23979 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
23980 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23981 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
23982 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23983 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
23984 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
23985 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23986 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1}
23987 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23988 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
23989 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
23990 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23991 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
23992 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23993 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2}
23994 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
23995 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
23996 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
23997 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2}
23998 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm9
23999 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
24000 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24001 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2}
24002 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
24003 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24004 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
24005 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
24006 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24007 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
24008 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
24009 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24010 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
24011 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
24012 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24013 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
24014 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
24015 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24016 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
24017 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
24018 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24019 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
24020 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
24021 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2}
24022 ; AVX512DQ-BW-NEXT: vmovdqa 2752(%rdi), %ymm0
24023 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
24024 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm0
24025 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1
24026 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24027 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
24028 ; AVX512DQ-BW-NEXT: vmovdqa 512(%rdi), %ymm0
24029 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
24030 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm0, %xmm0
24031 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24032 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0
24033 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24034 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2}
24035 ; AVX512DQ-BW-NEXT: vmovdqa 64(%rdi), %ymm3
24036 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
24037 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm3, %xmm3
24038 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
24039 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3
24040 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
24041 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2}
24042 ; AVX512DQ-BW-NEXT: vmovdqa 1408(%rdi), %ymm4
24043 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
24044 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm4, %xmm4
24045 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
24046 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4
24047 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
24048 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2}
24049 ; AVX512DQ-BW-NEXT: vmovdqa 960(%rdi), %ymm5
24050 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
24051 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm5, %xmm5
24052 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24053 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
24054 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24055 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2}
24056 ; AVX512DQ-BW-NEXT: vmovdqa 2304(%rdi), %ymm6
24057 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
24058 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm6, %xmm6
24059 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
24060 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6
24061 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
24062 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2}
24063 ; AVX512DQ-BW-NEXT: vmovdqa 1856(%rdi), %ymm7
24064 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
24065 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm7, %xmm7
24066 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24067 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7
24068 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24069 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
24070 ; AVX512DQ-BW-NEXT: vmovdqa 3200(%rdi), %ymm8
24071 ; AVX512DQ-BW-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
24072 ; AVX512DQ-BW-NEXT: vextracti128 $1, %ymm8, %xmm8
24073 ; AVX512DQ-BW-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8
24074 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2}
24075 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, 448(%rsi)
24076 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
24077 ; AVX512DQ-BW-NEXT: vmovaps %zmm12, 384(%rsi)
24078 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
24079 ; AVX512DQ-BW-NEXT: vmovaps %zmm12, 320(%rsi)
24080 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
24081 ; AVX512DQ-BW-NEXT: vmovaps %zmm12, 256(%rsi)
24082 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 192(%rsi)
24083 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
24084 ; AVX512DQ-BW-NEXT: vmovaps %zmm12, 128(%rsi)
24085 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, 64(%rsi)
24086 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
24087 ; AVX512DQ-BW-NEXT: vmovaps %zmm12, (%rsi)
24088 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 448(%rdx)
24089 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 256(%rdx)
24090 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 320(%rdx)
24091 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 128(%rdx)
24092 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 192(%rdx)
24093 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24094 ; AVX512DQ-BW-NEXT: vmovaps %zmm2, (%rdx)
24095 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24096 ; AVX512DQ-BW-NEXT: vmovaps %zmm2, 64(%rdx)
24097 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 384(%rdx)
24098 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, 448(%rcx)
24099 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 256(%rcx)
24100 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 320(%rcx)
24101 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rcx)
24102 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 192(%rcx)
24103 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, (%rcx)
24104 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 64(%rcx)
24105 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 384(%rcx)
24106 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 448(%r8)
24107 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 256(%r8)
24108 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 320(%r8)
24109 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%r8)
24110 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 192(%r8)
24111 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%r8)
24112 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%r8)
24113 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, 384(%r8)
24114 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24115 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%r9)
24116 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24117 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%r9)
24118 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24119 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%r9)
24120 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24121 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%r9)
24122 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24123 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%r9)
24124 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24125 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%r9)
24126 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24127 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%r9)
24128 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24129 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%r9)
24130 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
24131 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24132 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 448(%rax)
24133 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24134 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax)
24135 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24136 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 320(%rax)
24137 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24138 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax)
24139 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24140 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax)
24141 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24142 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, (%rax)
24143 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24144 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax)
24145 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24146 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax)
24147 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
24148 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24149 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 384(%rax)
24150 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, 448(%rax)
24151 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24152 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 256(%rax)
24153 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, 320(%rax)
24154 ; AVX512DQ-BW-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
24155 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 128(%rax)
24156 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24157 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 192(%rax)
24158 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rax)
24159 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24160 ; AVX512DQ-BW-NEXT: vmovaps %zmm0, 64(%rax)
24161 ; AVX512DQ-BW-NEXT: addq $7624, %rsp # imm = 0x1DC8
24162 ; AVX512DQ-BW-NEXT: vzeroupper
24163 ; AVX512DQ-BW-NEXT: retq
24165 ; AVX512DQ-BW-FCP-LABEL: load_i64_stride7_vf64:
24166 ; AVX512DQ-BW-FCP: # %bb.0:
24167 ; AVX512DQ-BW-FCP-NEXT: subq $7624, %rsp # imm = 0x1DC8
24168 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3328(%rdi), %zmm4
24169 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24170 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3264(%rdi), %zmm6
24171 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24172 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3008(%rdi), %zmm25
24173 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2944(%rdi), %zmm18
24174 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2880(%rdi), %zmm2
24175 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24176 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2816(%rdi), %zmm1
24177 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24178 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2752(%rdi), %zmm30
24179 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2688(%rdi), %zmm5
24180 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24181 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2432(%rdi), %zmm19
24182 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24183 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2368(%rdi), %zmm8
24184 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24185 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm9
24186 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24187 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm3
24188 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24189 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1536(%rdi), %zmm10
24190 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm12
24192 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill
24193 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm11
24194 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24195 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm13
24196 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24197 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm16
24198 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24199 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm14
24200 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24201 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm17
24202 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24203 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm15
24204 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24205 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = [14,0,0,7,14,0,0,7]
24206 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3]
24207 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm0, %zmm1
24208 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2704(%rdi), %xmm2
24209 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
24210 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
24211 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24212 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
24213 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm0, %zmm1
24214 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 464(%rdi), %xmm2
24215 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
24216 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
24217 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24218 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
24219 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm0, %zmm1
24220 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
24221 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
24222 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
24223 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24224 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
24225 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm10, %zmm0, %zmm1
24226 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1360(%rdi), %xmm2
24227 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
24228 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
24229 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24230 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
24231 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm0, %zmm1
24232 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 912(%rdi), %xmm2
24233 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
24234 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
24235 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24236 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
24237 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm1
24238 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2256(%rdi), %xmm2
24239 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
24240 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
24241 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24242 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
24243 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm0, %zmm1
24244 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1808(%rdi), %xmm2
24245 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],mem[2,3]
24246 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm2, %zmm1, %zmm1
24247 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24248 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm4, %zmm6, %zmm0
24249 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3152(%rdi), %xmm1
24250 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],mem[2,3]
24251 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
24252 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24253 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2880(%rdi), %ymm0, %ymm0
24254 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2816(%rdi), %ymm1
24255 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24256 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
24257 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm9 = [4,11]
24258 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
24259 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm9, %zmm1
24260 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
24261 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,7,14,0,0,7,14,0]
24262 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
24263 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm16
24264 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24265 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm1
24266 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24267 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm10, %zmm1
24268 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3072(%rdi), %zmm2
24269 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24270 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [4,5,6,13,4,5,6,13]
24271 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
24272 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
24273 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24274 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24275 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 640(%rdi), %ymm0, %ymm0
24276 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 576(%rdi), %ymm1
24277 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24278 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
24279 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm27
24280 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm2
24281 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
24282 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
24283 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm9, %zmm1
24284 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
24285 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
24286 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm1
24287 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24288 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm10, %zmm1
24289 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm2
24290 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24291 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
24292 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24293 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24294 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 192(%rdi), %ymm0, %ymm0
24295 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm1
24296 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24297 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
24298 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm31
24299 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm8
24300 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
24301 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm9, %zmm1
24302 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
24303 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm18
24304 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm29
24305 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm1
24306 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24307 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm10, %zmm1
24308 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm2
24309 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24310 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
24311 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24312 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24313 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1536(%rdi), %ymm0, %ymm0
24314 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1472(%rdi), %ymm1
24315 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24316 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
24317 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm26
24318 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm7
24319 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
24320 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm9, %zmm1
24321 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
24322 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1664(%rdi), %zmm13
24323 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm1
24324 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24325 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm10, %zmm1
24326 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm2
24327 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24328 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
24329 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24330 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24331 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1088(%rdi), %ymm0, %ymm0
24332 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1024(%rdi), %ymm1
24333 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24334 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
24335 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm19
24336 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm6
24337 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
24338 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24339 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm9, %zmm1
24340 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
24341 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm22
24342 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm1
24343 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24344 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm10, %zmm1
24345 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm2
24346 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24347 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm1
24348 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24349 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24350 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 2432(%rdi), %ymm0, %ymm0
24351 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2368(%rdi), %ymm1
24352 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24353 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
24354 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2304(%rdi), %zmm20
24355 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2240(%rdi), %zmm5
24356 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
24357 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24358 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm9, %zmm1
24359 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
24360 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2560(%rdi), %zmm15
24361 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2496(%rdi), %zmm1
24362 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24363 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm10, %zmm1
24364 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2624(%rdi), %zmm3
24365 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24366 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm4, %zmm1
24367 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24368 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24369 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 1984(%rdi), %ymm0, %ymm0
24370 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1920(%rdi), %ymm1
24371 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24372 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
24373 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21
24374 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm2
24375 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
24376 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24377 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm9, %zmm1
24378 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
24379 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2112(%rdi), %zmm14
24380 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2048(%rdi), %zmm1
24381 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24382 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm10, %zmm1
24383 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 2176(%rdi), %zmm23
24384 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm4, %zmm1
24385 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24386 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24387 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24388 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, 3328(%rdi), %ymm0, %ymm0
24389 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3264(%rdi), %ymm1
24390 ; AVX512DQ-BW-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
24391 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
24392 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3200(%rdi), %zmm17
24393 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3136(%rdi), %zmm11
24394 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm11, %zmm9
24395 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24396 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm0[4,5,6,7]
24397 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm9
24398 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm10, %zmm9
24399 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24400 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
24401 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24402 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm9
24403 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm10, %zmm9
24404 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24405 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm10, %zmm6
24406 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24407 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
24408 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm10, %zmm9
24409 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24410 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm10, %zmm2
24411 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24412 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm10, %zmm5
24413 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24414 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
24415 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
24416 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm6
24417 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm9
24418 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24419 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm17, %zmm10, %zmm11
24420 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24421 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3456(%rdi), %zmm24
24422 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3392(%rdi), %zmm2
24423 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24424 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm2, %zmm10
24425 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 3520(%rdi), %zmm30
24426 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm4, %zmm10
24427 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm10, %zmm0
24428 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24429 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2880(%rdi), %ymm0
24430 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
24431 ; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
24432 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [5,12]
24433 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm9
24434 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm0, %zmm9
24435 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24436 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm4[4,5,6,7]
24437 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [9,0,7,0,9,0,7,0]
24438 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
24439 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm12
24440 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm16, %zmm4, %zmm12
24441 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,5,6,14,4,5,6,14]
24442 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
24443 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
24444 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
24445 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24446 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 640(%rdi), %ymm9
24447 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
24448 ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
24449 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm27, %zmm0, %zmm1
24450 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24451 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm1[0,1,2,3],ymm9[4,5,6,7]
24452 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm12
24453 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
24454 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm4, %zmm12
24455 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
24456 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
24457 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24458 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm9
24459 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
24460 ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
24461 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24462 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm12
24463 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24464 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm31, %zmm0, %zmm12
24465 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
24466 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm12
24467 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
24468 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
24469 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm5
24470 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24471 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1536(%rdi), %ymm9
24472 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
24473 ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
24474 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm5
24475 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24476 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
24477 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm16
24478 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24479 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm0, %zmm12
24480 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
24481 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm12
24482 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
24483 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm26, %zmm4, %zmm12
24484 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
24485 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm7
24486 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24487 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1088(%rdi), %ymm9
24488 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm9 # 32-byte Folded Reload
24489 ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
24490 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
24491 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
24492 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24493 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm0, %zmm12
24494 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
24495 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm12
24496 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 # 64-byte Reload
24497 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm29, %zmm4, %zmm12
24498 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm12 # 64-byte Folded Reload
24499 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm12, %zmm9
24500 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24501 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2432(%rdi), %ymm9
24502 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm2 # 32-byte Folded Reload
24503 ; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20,21,22,23]
24504 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
24505 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm9
24506 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24507 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm20, %zmm0, %zmm9
24508 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
24509 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
24510 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm9 # 64-byte Folded Reload
24511 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm9 # 64-byte Folded Reload
24512 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm9, %zmm2
24513 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24514 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1984(%rdi), %ymm2
24515 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
24516 ; AVX512DQ-BW-FCP-NEXT: # ymm2 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
24517 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24518 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm3
24519 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24520 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm0, %zmm3
24521 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
24522 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm3
24523 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm3 # 64-byte Folded Reload
24524 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm23, %zmm10, %zmm3
24525 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
24526 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24527 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3328(%rdi), %ymm2
24528 ; AVX512DQ-BW-FCP-NEXT: vpalignr $8, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm1 # 32-byte Folded Reload
24529 ; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23]
24530 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24531 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24532 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm17, %zmm2, %zmm0
24533 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
24534 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm6 # 64-byte Folded Reload
24535 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24536 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm1
24537 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm1 # 64-byte Folded Reload
24538 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24539 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm1
24540 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm4, %zmm1
24541 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24542 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
24543 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm4, %zmm1
24544 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24545 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm7, %zmm4, %zmm19
24546 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24547 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm4, %zmm20
24548 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24549 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm9, %zmm4, %zmm21
24550 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24551 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm4, %zmm17
24552 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24553 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24554 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm8, %zmm24, %zmm4
24555 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm10, %zmm4
24556 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
24557 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24558 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm23 = [10,3,10,3,10,3,10,3]
24559 ; AVX512DQ-BW-FCP-NEXT: # zmm23 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24560 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm0
24561 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm23, %zmm25
24562 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm27 = [11,4,11,4,11,4,11,4]
24563 ; AVX512DQ-BW-FCP-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24564 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
24565 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm27, %zmm1
24566 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24567 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm31 = [12,5,12,5,12,5,12,5]
24568 ; AVX512DQ-BW-FCP-NEXT: # zmm31 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24569 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
24570 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm31, %zmm1
24571 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24572 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [13,6,13,6,13,6,13,6]
24573 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24574 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1
24575 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm12, %zmm1
24576 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24577 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [2,9,2,9,2,9,2,9]
24578 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
24579 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm28, %zmm1, %zmm0
24580 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24581 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
24582 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
24583 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm23, %zmm0
24584 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24585 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
24586 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm27, %zmm0
24587 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24588 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
24589 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm31, %zmm0
24590 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24591 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm0
24592 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm12, %zmm0
24593 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24594 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm18, %zmm1, %zmm28
24595 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24596 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
24597 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm23, %zmm0
24598 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm28
24599 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
24600 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm27, %zmm0
24601 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24602 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
24603 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm31, %zmm0
24604 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24605 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
24606 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm12, %zmm0
24607 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24608 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm1, %zmm26
24609 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24610 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm9
24611 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm23, %zmm9
24612 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
24613 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm27, %zmm0
24614 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24615 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
24616 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm31, %zmm0
24617 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm17
24618 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm0
24619 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm12, %zmm0
24620 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16
24621 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm22, %zmm1, %zmm29
24622 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24623 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
24624 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0
24625 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
24626 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19
24627 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm10
24628 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm27, %zmm10
24629 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm13
24630 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm31, %zmm13
24631 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm0
24632 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm0
24633 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24634 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm1, %zmm21
24635 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24636 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
24637 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm2
24638 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm2
24639 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm22
24640 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
24641 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm4
24642 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm3
24643 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm3
24644 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm20
24645 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm7
24646 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm7
24647 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm5
24648 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
24649 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm3
24650 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
24651 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm23, %zmm3
24652 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm29
24653 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm8, %zmm23
24654 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24655 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm0 # 64-byte Folded Reload
24656 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24657 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm2
24658 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm27, %zmm2
24659 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24660 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
24661 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm12, %zmm6
24662 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24663 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24664 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
24665 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24666 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
24667 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
24668 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24669 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24670 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
24671 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24672 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24673 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm6 # 64-byte Folded Reload
24674 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24675 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24676 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24677 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm12, %zmm8
24678 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24679 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24680 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm12, %zmm8 # 64-byte Folded Reload
24681 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24682 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm27
24683 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm0
24684 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm31, %zmm0
24685 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm31
24686 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm8
24687 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm12, %zmm8
24688 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm18
24689 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm24, %zmm29, %zmm12
24690 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm29
24691 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm26
24692 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,9,0,5,6,9]
24693 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
24694 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24695 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
24696 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm11
24697 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24698 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
24699 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm25
24700 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24701 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
24702 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
24703 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24704 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
24705 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
24706 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24707 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
24708 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm22
24709 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24710 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
24711 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm19
24712 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24713 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
24714 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm3
24715 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24716 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm23
24717 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,10,0,5,6,10]
24718 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
24719 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm2
24720 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24721 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
24722 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
24723 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24724 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24725 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm2
24726 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24727 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24728 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
24729 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24730 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24731 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm9
24732 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24733 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm10
24734 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24735 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm4
24736 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24737 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm27
24738 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,11,0,5,6,11]
24739 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
24740 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm0
24741 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24742 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
24743 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
24744 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24745 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
24746 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
24747 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24748 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24749 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm9
24750 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24751 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm17
24752 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24753 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm13
24754 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24755 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm20
24756 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24757 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm31
24758 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [0,5,6,12,0,5,6,12]
24759 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
24760 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm18
24761 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24762 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
24763 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm28
24764 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24765 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
24766 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm28
24767 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24768 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 # 64-byte Reload
24769 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm28
24770 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24771 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm16
24772 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24773 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24774 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm9
24775 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24776 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm7
24777 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24778 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm12
24779 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [4,5,8,15,4,5,8,15]
24780 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
24781 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
24782 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm21, %zmm1, %zmm16
24783 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
24784 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm8, %zmm1, %zmm17
24785 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
24786 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm14, %zmm1, %zmm18
24787 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
24788 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm24, %zmm1, %zmm20
24789 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
24790 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm11, %zmm1, %zmm21
24791 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm25, %zmm1, %zmm5
24792 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24793 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm30, %zmm1, %zmm29
24794 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm19, %zmm1, %zmm26
24795 ; AVX512DQ-BW-FCP-NEXT: movb $24, %al
24796 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
24797 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24798 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 # 64-byte Reload
24799 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm22 {%k1} = zmm1[4,5,4,5],zmm15[4,5,4,5]
24800 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm30 = [7,0,9,0,7,0,9,0]
24801 ; AVX512DQ-BW-FCP-NEXT: # zmm30 = mem[0,1,2,3,0,1,2,3]
24802 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm0
24803 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm30, %zmm0
24804 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24805 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,11]
24806 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm15, %zmm2, %zmm1
24807 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24808 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24809 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24810 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
24811 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm19 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
24812 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
24813 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
24814 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24815 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
24816 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24817 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
24818 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24819 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
24820 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm14 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
24821 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
24822 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
24823 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24824 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
24825 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, (%rsp) # 64-byte Spill
24826 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24827 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24828 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
24829 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm3[4,5,4,5],zmm1[4,5,4,5]
24830 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0
24831 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm30, %zmm0
24832 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24833 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm3
24834 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24835 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24836 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24837 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
24838 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm0[4,5,4,5],zmm1[4,5,4,5]
24839 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm3
24840 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
24841 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24842 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm1, %zmm2, %zmm0
24843 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24844 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24845 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
24846 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k1} = zmm0[4,5,4,5],zmm6[4,5,4,5]
24847 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3
24848 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm0, %zmm30, %zmm3
24849 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24850 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm0
24851 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24852 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
24853 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24854 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
24855 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 {%k1} = zmm15[4,5,4,5],zmm6[4,5,4,5]
24856 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24857 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm28
24858 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
24859 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm13, %zmm30, %zmm28
24860 ; AVX512DQ-BW-FCP-NEXT: vpermi2q %zmm15, %zmm6, %zmm30
24861 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm6, %zmm2, %zmm15
24862 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 # 64-byte Reload
24863 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm25 {%k1} = zmm13[4,5,4,5],zmm3[4,5,4,5]
24864 ; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm3, %zmm2, %zmm13
24865 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} xmm0 = [6,13]
24866 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24867 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm2 # 64-byte Folded Reload
24868 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
24869 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm3 # 64-byte Folded Reload
24870 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24871 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload
24872 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
24873 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm4 # 64-byte Folded Reload
24874 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
24875 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm5 # 64-byte Folded Reload
24876 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
24877 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm6 # 64-byte Folded Reload
24878 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
24879 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm8 # 64-byte Folded Reload
24880 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
24881 ; AVX512DQ-BW-FCP-NEXT: vpermt2q {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm7 # 64-byte Folded Reload
24882 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm0 # 32-byte Folded Reload
24883 ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm2[0,1,2,3],mem[4,5,6,7]
24884 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
24885 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24886 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm0 # 32-byte Folded Reload
24887 ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm3[0,1,2,3],mem[4,5,6,7]
24888 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm17, %zmm17
24889 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, (%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
24890 ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm1[0,1,2,3],mem[4,5,6,7]
24891 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm18, %zmm0
24892 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24893 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm0 # 32-byte Folded Reload
24894 ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm4[0,1,2,3],mem[4,5,6,7]
24895 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
24896 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
24897 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm0 # 32-byte Folded Reload
24898 ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm5[0,1,2,3],mem[4,5,6,7]
24899 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
24900 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
24901 ; AVX512DQ-BW-FCP-NEXT: # ymm0 = ymm6[0,1,2,3],mem[4,5,6,7]
24902 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24903 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
24904 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24905 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm15[4,5,6,7]
24906 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm29, %zmm29
24907 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm13[4,5,6,7]
24908 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm26, %zmm0
24909 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24910 ; AVX512DQ-BW-FCP-NEXT: movb $-32, %al
24911 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2
24912 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24913 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
24914 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24915 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm19
24916 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24917 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm19 {%k2}
24918 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24919 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
24920 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24921 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm24
24922 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24923 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm24 {%k2}
24924 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24925 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k2}
24926 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24927 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24928 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
24929 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24930 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24931 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm25 {%k2}
24932 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24933 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
24934 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm25
24935 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
24936 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24937 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
24938 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24939 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
24940 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm22
24941 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24942 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24943 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
24944 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24945 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
24946 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24947 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24948 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24949 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
24950 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24951 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
24952 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
24953 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
24954 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24955 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1}
24956 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24957 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k2}
24958 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
24959 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24960 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1}
24961 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24962 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm11 {%k2}
24963 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
24964 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24965 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k1}
24966 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24967 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10 {%k2}
24968 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
24969 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24970 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
24971 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24972 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k2}
24973 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24974 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
24975 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
24976 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm0 {%k2}
24977 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm9
24978 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
24979 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24980 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k2}
24981 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
24982 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24983 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm14 {%k2}
24984 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
24985 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24986 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm15 {%k2}
24987 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
24988 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24989 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm18 {%k2}
24990 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
24991 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24992 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm20 {%k2}
24993 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
24994 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24995 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm23 {%k2}
24996 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 # 64-byte Reload
24997 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
24998 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm26 {%k2}
24999 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
25000 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm27 {%k2}
25001 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2752(%rdi), %ymm0
25002 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
25003 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
25004 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm28, %zmm1
25005 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25006 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
25007 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 512(%rdi), %ymm0
25008 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm0 = mem[8,9,10,11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23]
25009 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm0, %xmm0
25010 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
25011 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm0, %zmm3, %zmm0
25012 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
25013 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k2}
25014 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 64(%rdi), %ymm3
25015 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm3 = mem[8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23]
25016 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3
25017 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
25018 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm3, %zmm4, %zmm3
25019 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
25020 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm3 {%k2}
25021 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1408(%rdi), %ymm4
25022 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm4 = mem[8,9,10,11,12,13,14,15],ymm4[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm4[16,17,18,19,20,21,22,23]
25023 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4
25024 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
25025 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm4, %zmm5, %zmm4
25026 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
25027 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4 {%k2}
25028 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 960(%rdi), %ymm5
25029 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm5 = mem[8,9,10,11,12,13,14,15],ymm5[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm5[16,17,18,19,20,21,22,23]
25030 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
25031 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
25032 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm5, %zmm6, %zmm5
25033 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
25034 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm5 {%k2}
25035 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 2304(%rdi), %ymm6
25036 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm6 = mem[8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20,21,22,23]
25037 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
25038 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
25039 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm6, %zmm7, %zmm6
25040 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
25041 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm6 {%k2}
25042 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 1856(%rdi), %ymm7
25043 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm7 = mem[8,9,10,11,12,13,14,15],ymm7[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm7[16,17,18,19,20,21,22,23]
25044 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm7
25045 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25046 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm7, %zmm8, %zmm7
25047 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
25048 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2}
25049 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 3200(%rdi), %ymm8
25050 ; AVX512DQ-BW-FCP-NEXT: vpalignr {{.*#+}} ymm8 = mem[8,9,10,11,12,13,14,15],ymm8[0,1,2,3,4,5,6,7],mem[24,25,26,27,28,29,30,31],ymm8[16,17,18,19,20,21,22,23]
25051 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8
25052 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $0, %xmm8, %zmm30, %zmm8
25053 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm8 {%k2}
25054 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, 448(%rsi)
25055 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
25056 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 384(%rsi)
25057 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
25058 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 320(%rsi)
25059 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
25060 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 256(%rsi)
25061 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 192(%rsi)
25062 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
25063 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, 128(%rsi)
25064 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 64(%rsi)
25065 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
25066 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm12, (%rsi)
25067 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 448(%rdx)
25068 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 256(%rdx)
25069 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rdx)
25070 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 128(%rdx)
25071 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 192(%rdx)
25072 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25073 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, (%rdx)
25074 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
25075 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
25076 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 384(%rdx)
25077 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, 448(%rcx)
25078 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 256(%rcx)
25079 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, 320(%rcx)
25080 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 128(%rcx)
25081 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 192(%rcx)
25082 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, (%rcx)
25083 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, 64(%rcx)
25084 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rcx)
25085 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 448(%r8)
25086 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, 256(%r8)
25087 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 320(%r8)
25088 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 128(%r8)
25089 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 192(%r8)
25090 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%r8)
25091 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%r8)
25092 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 384(%r8)
25093 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25094 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%r9)
25095 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25096 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%r9)
25097 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25098 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%r9)
25099 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25100 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%r9)
25101 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25102 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%r9)
25103 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25104 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%r9)
25105 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25106 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%r9)
25107 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25108 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%r9)
25109 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
25110 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25111 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 448(%rax)
25112 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25113 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
25114 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25115 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 320(%rax)
25116 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25117 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
25118 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25119 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
25120 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25121 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, (%rax)
25122 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25123 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
25124 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25125 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
25126 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
25127 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25128 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 384(%rax)
25129 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, 448(%rax)
25130 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25131 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 256(%rax)
25132 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 320(%rax)
25133 ; AVX512DQ-BW-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload
25134 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 128(%rax)
25135 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25136 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 192(%rax)
25137 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rax)
25138 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
25139 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm0, 64(%rax)
25140 ; AVX512DQ-BW-FCP-NEXT: addq $7624, %rsp # imm = 0x1DC8
25141 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
25142 ; AVX512DQ-BW-FCP-NEXT: retq
25143 %wide.vec = load <448 x i64>, ptr %in.vec, align 64
25144 %strided.vec0 = shufflevector <448 x i64> %wide.vec, <448 x i64> poison, <64 x i32> <i32 0, i32 7, i32 14, i32 21, i32 28, i32 35, i32 42, i32 49, i32 56, i32 63, i32 70, i32 77, i32 84, i32 91, i32 98, i32 105, i32 112, i32 119, i32 126, i32 133, i32 140, i32 147, i32 154, i32 161, i32 168, i32 175, i32 182, i32 189, i32 196, i32 203, i32 210, i32 217, i32 224, i32 231, i32 238, i32 245, i32 252, i32 259, i32 266, i32 273, i32 280, i32 287, i32 294, i32 301, i32 308, i32 315, i32 322, i32 329, i32 336, i32 343, i32 350, i32 357, i32 364, i32 371, i32 378, i32 385, i32 392, i32 399, i32 406, i32 413, i32 420, i32 427, i32 434, i32 441>
25145 %strided.vec1 = shufflevector <448 x i64> %wide.vec, <448 x i64> poison, <64 x i32> <i32 1, i32 8, i32 15, i32 22, i32 29, i32 36, i32 43, i32 50, i32 57, i32 64, i32 71, i32 78, i32 85, i32 92, i32 99, i32 106, i32 113, i32 120, i32 127, i32 134, i32 141, i32 148, i32 155, i32 162, i32 169, i32 176, i32 183, i32 190, i32 197, i32 204, i32 211, i32 218, i32 225, i32 232, i32 239, i32 246, i32 253, i32 260, i32 267, i32 274, i32 281, i32 288, i32 295, i32 302, i32 309, i32 316, i32 323, i32 330, i32 337, i32 344, i32 351, i32 358, i32 365, i32 372, i32 379, i32 386, i32 393, i32 400, i32 407, i32 414, i32 421, i32 428, i32 435, i32 442>
25146 %strided.vec2 = shufflevector <448 x i64> %wide.vec, <448 x i64> poison, <64 x i32> <i32 2, i32 9, i32 16, i32 23, i32 30, i32 37, i32 44, i32 51, i32 58, i32 65, i32 72, i32 79, i32 86, i32 93, i32 100, i32 107, i32 114, i32 121, i32 128, i32 135, i32 142, i32 149, i32 156, i32 163, i32 170, i32 177, i32 184, i32 191, i32 198, i32 205, i32 212, i32 219, i32 226, i32 233, i32 240, i32 247, i32 254, i32 261, i32 268, i32 275, i32 282, i32 289, i32 296, i32 303, i32 310, i32 317, i32 324, i32 331, i32 338, i32 345, i32 352, i32 359, i32 366, i32 373, i32 380, i32 387, i32 394, i32 401, i32 408, i32 415, i32 422, i32 429, i32 436, i32 443>
25147 %strided.vec3 = shufflevector <448 x i64> %wide.vec, <448 x i64> poison, <64 x i32> <i32 3, i32 10, i32 17, i32 24, i32 31, i32 38, i32 45, i32 52, i32 59, i32 66, i32 73, i32 80, i32 87, i32 94, i32 101, i32 108, i32 115, i32 122, i32 129, i32 136, i32 143, i32 150, i32 157, i32 164, i32 171, i32 178, i32 185, i32 192, i32 199, i32 206, i32 213, i32 220, i32 227, i32 234, i32 241, i32 248, i32 255, i32 262, i32 269, i32 276, i32 283, i32 290, i32 297, i32 304, i32 311, i32 318, i32 325, i32 332, i32 339, i32 346, i32 353, i32 360, i32 367, i32 374, i32 381, i32 388, i32 395, i32 402, i32 409, i32 416, i32 423, i32 430, i32 437, i32 444>
25148 %strided.vec4 = shufflevector <448 x i64> %wide.vec, <448 x i64> poison, <64 x i32> <i32 4, i32 11, i32 18, i32 25, i32 32, i32 39, i32 46, i32 53, i32 60, i32 67, i32 74, i32 81, i32 88, i32 95, i32 102, i32 109, i32 116, i32 123, i32 130, i32 137, i32 144, i32 151, i32 158, i32 165, i32 172, i32 179, i32 186, i32 193, i32 200, i32 207, i32 214, i32 221, i32 228, i32 235, i32 242, i32 249, i32 256, i32 263, i32 270, i32 277, i32 284, i32 291, i32 298, i32 305, i32 312, i32 319, i32 326, i32 333, i32 340, i32 347, i32 354, i32 361, i32 368, i32 375, i32 382, i32 389, i32 396, i32 403, i32 410, i32 417, i32 424, i32 431, i32 438, i32 445>
25149 %strided.vec5 = shufflevector <448 x i64> %wide.vec, <448 x i64> poison, <64 x i32> <i32 5, i32 12, i32 19, i32 26, i32 33, i32 40, i32 47, i32 54, i32 61, i32 68, i32 75, i32 82, i32 89, i32 96, i32 103, i32 110, i32 117, i32 124, i32 131, i32 138, i32 145, i32 152, i32 159, i32 166, i32 173, i32 180, i32 187, i32 194, i32 201, i32 208, i32 215, i32 222, i32 229, i32 236, i32 243, i32 250, i32 257, i32 264, i32 271, i32 278, i32 285, i32 292, i32 299, i32 306, i32 313, i32 320, i32 327, i32 334, i32 341, i32 348, i32 355, i32 362, i32 369, i32 376, i32 383, i32 390, i32 397, i32 404, i32 411, i32 418, i32 425, i32 432, i32 439, i32 446>
25150 %strided.vec6 = shufflevector <448 x i64> %wide.vec, <448 x i64> poison, <64 x i32> <i32 6, i32 13, i32 20, i32 27, i32 34, i32 41, i32 48, i32 55, i32 62, i32 69, i32 76, i32 83, i32 90, i32 97, i32 104, i32 111, i32 118, i32 125, i32 132, i32 139, i32 146, i32 153, i32 160, i32 167, i32 174, i32 181, i32 188, i32 195, i32 202, i32 209, i32 216, i32 223, i32 230, i32 237, i32 244, i32 251, i32 258, i32 265, i32 272, i32 279, i32 286, i32 293, i32 300, i32 307, i32 314, i32 321, i32 328, i32 335, i32 342, i32 349, i32 356, i32 363, i32 370, i32 377, i32 384, i32 391, i32 398, i32 405, i32 412, i32 419, i32 426, i32 433, i32 440, i32 447>
25151 store <64 x i64> %strided.vec0, ptr %out.vec0, align 64
25152 store <64 x i64> %strided.vec1, ptr %out.vec1, align 64
25153 store <64 x i64> %strided.vec2, ptr %out.vec2, align 64
25154 store <64 x i64> %strided.vec3, ptr %out.vec3, align 64
25155 store <64 x i64> %strided.vec4, ptr %out.vec4, align 64
25156 store <64 x i64> %strided.vec5, ptr %out.vec5, align 64
25157 store <64 x i64> %strided.vec6, ptr %out.vec6, align 64