; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP

; These patterns are produced by LoopVectorizer for interleaved loads.
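;
; For reference only (not part of the test input): a scalar loop of roughly the
; following shape is the kind of source that the LoopVectorizer turns into the
; wide load plus shufflevector sequences tested below. The function and variable
; names are illustrative assumptions, not taken from any particular source file.
;
;   // Deinterleave 8 i32 fields from an array of 8-element records (stride 8).
;   void deinterleave8(const int *in, int *out0, int *out1, int *out2, int *out3,
;                      int *out4, int *out5, int *out6, int *out7, int n) {
;     for (int i = 0; i < n; ++i) {
;       out0[i] = in[8 * i + 0];
;       out1[i] = in[8 * i + 1];
;       out2[i] = in[8 * i + 2];
;       out3[i] = in[8 * i + 3];
;       out4[i] = in[8 * i + 4];
;       out5[i] = in[8 * i + 5];
;       out6[i] = in[8 * i + 6];
;       out7[i] = in[8 * i + 7];
;     }
;   }
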
define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
19 ; SSE-LABEL: load_i32_stride8_vf2:
21 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
22 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
23 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
24 ; SSE-NEXT: movdqa (%rdi), %xmm0
25 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
26 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
27 ; SSE-NEXT: movdqa 48(%rdi), %xmm3
28 ; SSE-NEXT: movdqa %xmm0, %xmm4
29 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
30 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
31 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
32 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
33 ; SSE-NEXT: movdqa %xmm1, %xmm6
34 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
35 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
36 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
37 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
38 ; SSE-NEXT: movq %xmm4, (%rsi)
39 ; SSE-NEXT: movq %xmm5, (%rdx)
40 ; SSE-NEXT: movq %xmm0, (%rcx)
41 ; SSE-NEXT: movq %xmm2, (%r8)
42 ; SSE-NEXT: movq %xmm6, (%r9)
43 ; SSE-NEXT: movq %xmm7, (%r11)
44 ; SSE-NEXT: movq %xmm1, (%r10)
45 ; SSE-NEXT: movq %xmm3, (%rax)
48 ; AVX-LABEL: load_i32_stride8_vf2:
50 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
51 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
52 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
53 ; AVX-NEXT: vmovaps (%rdi), %ymm0
54 ; AVX-NEXT: vmovaps 32(%rdi), %ymm1
55 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
56 ; AVX-NEXT: vmovdqa (%rdi), %xmm3
57 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
58 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[1,1,1,1]
59 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm2[2,3],xmm5[4,5,6,7]
60 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
61 ; AVX-NEXT: vunpcklps {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
62 ; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
63 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
64 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0,2,3,6,4,6,7]
65 ; AVX-NEXT: vextractf128 $1, %ymm6, %xmm6
66 ; AVX-NEXT: vunpckhps {{.*#+}} ymm7 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
67 ; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
68 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
69 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
70 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
71 ; AVX-NEXT: vmovq %xmm4, (%rsi)
72 ; AVX-NEXT: vmovq %xmm5, (%rdx)
73 ; AVX-NEXT: vmovq %xmm2, (%rcx)
74 ; AVX-NEXT: vpextrq $1, %xmm2, (%r8)
75 ; AVX-NEXT: vmovlps %xmm3, (%r9)
76 ; AVX-NEXT: vmovlps %xmm6, (%r11)
77 ; AVX-NEXT: vmovlps %xmm7, (%r10)
78 ; AVX-NEXT: vmovlps %xmm0, (%rax)
79 ; AVX-NEXT: vzeroupper
82 ; AVX2-LABEL: load_i32_stride8_vf2:
84 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
85 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
86 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
87 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
88 ; AVX2-NEXT: vmovaps (%rdi), %ymm1
89 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2
90 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
91 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
92 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
93 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
94 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
95 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
96 ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3
97 ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
98 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
99 ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6
100 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
101 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
102 ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
103 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
104 ; AVX2-NEXT: vmovq %xmm4, (%rsi)
105 ; AVX2-NEXT: vmovq %xmm5, (%rdx)
106 ; AVX2-NEXT: vmovq %xmm2, (%rcx)
107 ; AVX2-NEXT: vpextrq $1, %xmm2, (%r8)
108 ; AVX2-NEXT: vmovlps %xmm3, (%r9)
109 ; AVX2-NEXT: vmovlps %xmm6, (%r11)
110 ; AVX2-NEXT: vmovlps %xmm1, (%r10)
111 ; AVX2-NEXT: vmovlps %xmm0, (%rax)
112 ; AVX2-NEXT: vzeroupper
115 ; AVX2-FP-LABEL: load_i32_stride8_vf2:
117 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
118 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
119 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
120 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0
121 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
122 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm2
123 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3
124 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
125 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
126 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
127 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
128 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
129 ; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3
130 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
131 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
132 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6
133 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
134 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm1
135 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
136 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
137 ; AVX2-FP-NEXT: vmovq %xmm4, (%rsi)
138 ; AVX2-FP-NEXT: vmovq %xmm5, (%rdx)
139 ; AVX2-FP-NEXT: vmovq %xmm2, (%rcx)
140 ; AVX2-FP-NEXT: vpextrq $1, %xmm2, (%r8)
141 ; AVX2-FP-NEXT: vmovlps %xmm3, (%r9)
142 ; AVX2-FP-NEXT: vmovlps %xmm6, (%r11)
143 ; AVX2-FP-NEXT: vmovlps %xmm1, (%r10)
144 ; AVX2-FP-NEXT: vmovlps %xmm0, (%rax)
145 ; AVX2-FP-NEXT: vzeroupper
148 ; AVX2-FCP-LABEL: load_i32_stride8_vf2:
150 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
151 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
152 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
153 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0
154 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
155 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm2
156 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
157 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
158 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
159 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0],xmm3[1],xmm5[2,3]
160 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
161 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
162 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3
163 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm1[1,1,1,1,5,5,5,5]
164 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm0[1],ymm6[2,3,4],ymm0[5],ymm6[6,7]
165 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
166 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
167 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm1
168 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7]
169 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
170 ; AVX2-FCP-NEXT: vmovq %xmm4, (%rsi)
171 ; AVX2-FCP-NEXT: vmovq %xmm5, (%rdx)
172 ; AVX2-FCP-NEXT: vmovq %xmm2, (%rcx)
173 ; AVX2-FCP-NEXT: vpextrq $1, %xmm2, (%r8)
174 ; AVX2-FCP-NEXT: vmovlps %xmm3, (%r9)
175 ; AVX2-FCP-NEXT: vmovlps %xmm6, (%r11)
176 ; AVX2-FCP-NEXT: vmovlps %xmm1, (%r10)
177 ; AVX2-FCP-NEXT: vmovlps %xmm0, (%rax)
178 ; AVX2-FCP-NEXT: vzeroupper
179 ; AVX2-FCP-NEXT: retq
181 ; AVX512-LABEL: load_i32_stride8_vf2:
183 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
184 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
185 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
186 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
187 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm1
188 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
189 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
190 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
191 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
192 ; AVX512-NEXT: vmovaps 32(%rdi), %ymm1
193 ; AVX512-NEXT: vmovaps (%rdi), %ymm4
194 ; AVX512-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
195 ; AVX512-NEXT: vextractf128 $1, %ymm5, %xmm5
196 ; AVX512-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
197 ; AVX512-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
198 ; AVX512-NEXT: vextractf128 $1, %ymm6, %xmm6
199 ; AVX512-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
200 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm4
201 ; AVX512-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
202 ; AVX512-NEXT: vextractf128 $1, %ymm1, %xmm1
203 ; AVX512-NEXT: vmovq %xmm2, (%rsi)
204 ; AVX512-NEXT: vmovq %xmm3, (%rdx)
205 ; AVX512-NEXT: vmovq %xmm0, (%rcx)
206 ; AVX512-NEXT: vpextrq $1, %xmm0, (%r8)
207 ; AVX512-NEXT: vmovlps %xmm5, (%r9)
208 ; AVX512-NEXT: vmovlps %xmm6, (%r11)
209 ; AVX512-NEXT: vmovlps %xmm4, (%r10)
210 ; AVX512-NEXT: vmovlps %xmm1, (%rax)
211 ; AVX512-NEXT: vzeroupper
214 ; AVX512-FCP-LABEL: load_i32_stride8_vf2:
215 ; AVX512-FCP: # %bb.0:
216 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
217 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
218 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
219 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
220 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
221 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
222 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
223 ; AVX512-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
224 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
225 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
226 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm4
227 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
228 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
229 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5]
230 ; AVX512-FCP-NEXT: # ymm6 = mem[0,1,0,1]
231 ; AVX512-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
232 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
233 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
234 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
235 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
236 ; AVX512-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
237 ; AVX512-FCP-NEXT: vmovq %xmm2, (%rsi)
238 ; AVX512-FCP-NEXT: vmovq %xmm3, (%rdx)
239 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rcx)
240 ; AVX512-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
241 ; AVX512-FCP-NEXT: vmovq %xmm5, (%r9)
242 ; AVX512-FCP-NEXT: vmovq %xmm6, (%r11)
243 ; AVX512-FCP-NEXT: vmovq %xmm4, (%r10)
244 ; AVX512-FCP-NEXT: vmovq %xmm1, (%rax)
245 ; AVX512-FCP-NEXT: vzeroupper
246 ; AVX512-FCP-NEXT: retq
248 ; AVX512DQ-LABEL: load_i32_stride8_vf2:
250 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
251 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
252 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
253 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
254 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm1
255 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
256 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
257 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
258 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
259 ; AVX512DQ-NEXT: vmovaps 32(%rdi), %ymm1
260 ; AVX512DQ-NEXT: vmovaps (%rdi), %ymm4
261 ; AVX512DQ-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
262 ; AVX512DQ-NEXT: vextractf128 $1, %ymm5, %xmm5
263 ; AVX512DQ-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
264 ; AVX512DQ-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
265 ; AVX512DQ-NEXT: vextractf128 $1, %ymm6, %xmm6
266 ; AVX512DQ-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
267 ; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm4
268 ; AVX512DQ-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
269 ; AVX512DQ-NEXT: vextractf128 $1, %ymm1, %xmm1
270 ; AVX512DQ-NEXT: vmovq %xmm2, (%rsi)
271 ; AVX512DQ-NEXT: vmovq %xmm3, (%rdx)
272 ; AVX512DQ-NEXT: vmovq %xmm0, (%rcx)
273 ; AVX512DQ-NEXT: vpextrq $1, %xmm0, (%r8)
274 ; AVX512DQ-NEXT: vmovlps %xmm5, (%r9)
275 ; AVX512DQ-NEXT: vmovlps %xmm6, (%r11)
276 ; AVX512DQ-NEXT: vmovlps %xmm4, (%r10)
277 ; AVX512DQ-NEXT: vmovlps %xmm1, (%rax)
278 ; AVX512DQ-NEXT: vzeroupper
279 ; AVX512DQ-NEXT: retq
281 ; AVX512DQ-FCP-LABEL: load_i32_stride8_vf2:
282 ; AVX512DQ-FCP: # %bb.0:
283 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
284 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
285 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
286 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
287 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
288 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
289 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
290 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
291 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
292 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
293 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm4
294 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
295 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
296 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5]
297 ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,0,1]
298 ; AVX512DQ-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
299 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
300 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
301 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
302 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
303 ; AVX512DQ-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
304 ; AVX512DQ-FCP-NEXT: vmovq %xmm2, (%rsi)
305 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%rdx)
306 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rcx)
307 ; AVX512DQ-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
308 ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r9)
309 ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%r11)
310 ; AVX512DQ-FCP-NEXT: vmovq %xmm4, (%r10)
311 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%rax)
312 ; AVX512DQ-FCP-NEXT: vzeroupper
313 ; AVX512DQ-FCP-NEXT: retq
315 ; AVX512BW-LABEL: load_i32_stride8_vf2:
317 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
318 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
319 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
320 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
321 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm1
322 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
323 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
324 ; AVX512BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
325 ; AVX512BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
326 ; AVX512BW-NEXT: vmovaps 32(%rdi), %ymm1
327 ; AVX512BW-NEXT: vmovaps (%rdi), %ymm4
328 ; AVX512BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
329 ; AVX512BW-NEXT: vextractf128 $1, %ymm5, %xmm5
330 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
331 ; AVX512BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
332 ; AVX512BW-NEXT: vextractf128 $1, %ymm6, %xmm6
333 ; AVX512BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
334 ; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm4
335 ; AVX512BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
336 ; AVX512BW-NEXT: vextractf128 $1, %ymm1, %xmm1
337 ; AVX512BW-NEXT: vmovq %xmm2, (%rsi)
338 ; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
339 ; AVX512BW-NEXT: vmovq %xmm0, (%rcx)
340 ; AVX512BW-NEXT: vpextrq $1, %xmm0, (%r8)
341 ; AVX512BW-NEXT: vmovlps %xmm5, (%r9)
342 ; AVX512BW-NEXT: vmovlps %xmm6, (%r11)
343 ; AVX512BW-NEXT: vmovlps %xmm4, (%r10)
344 ; AVX512BW-NEXT: vmovlps %xmm1, (%rax)
345 ; AVX512BW-NEXT: vzeroupper
346 ; AVX512BW-NEXT: retq
348 ; AVX512BW-FCP-LABEL: load_i32_stride8_vf2:
349 ; AVX512BW-FCP: # %bb.0:
350 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
351 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
352 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
353 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
354 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
355 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
356 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
357 ; AVX512BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
358 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
359 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
360 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4
361 ; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
362 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
363 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5]
364 ; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
365 ; AVX512BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
366 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
367 ; AVX512BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
368 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
369 ; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
370 ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
371 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rsi)
372 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
373 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rcx)
374 ; AVX512BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
375 ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
376 ; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11)
377 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r10)
378 ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
379 ; AVX512BW-FCP-NEXT: vzeroupper
380 ; AVX512BW-FCP-NEXT: retq
382 ; AVX512DQ-BW-LABEL: load_i32_stride8_vf2:
383 ; AVX512DQ-BW: # %bb.0:
384 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
385 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
386 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
387 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
388 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm1
389 ; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
390 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
391 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
392 ; AVX512DQ-BW-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
393 ; AVX512DQ-BW-NEXT: vmovaps 32(%rdi), %ymm1
394 ; AVX512DQ-BW-NEXT: vmovaps (%rdi), %ymm4
395 ; AVX512DQ-BW-NEXT: vunpcklps {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
396 ; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm5, %xmm5
397 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm6 = ymm4[1,1,1,1,5,5,5,5]
398 ; AVX512DQ-BW-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0],ymm1[1],ymm6[2,3,4],ymm1[5],ymm6[6,7]
399 ; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm6, %xmm6
400 ; AVX512DQ-BW-NEXT: vunpckhps {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
401 ; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm4
402 ; AVX512DQ-BW-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
403 ; AVX512DQ-BW-NEXT: vextractf128 $1, %ymm1, %xmm1
404 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rsi)
405 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
406 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rcx)
407 ; AVX512DQ-BW-NEXT: vpextrq $1, %xmm0, (%r8)
408 ; AVX512DQ-BW-NEXT: vmovlps %xmm5, (%r9)
409 ; AVX512DQ-BW-NEXT: vmovlps %xmm6, (%r11)
410 ; AVX512DQ-BW-NEXT: vmovlps %xmm4, (%r10)
411 ; AVX512DQ-BW-NEXT: vmovlps %xmm1, (%rax)
412 ; AVX512DQ-BW-NEXT: vzeroupper
413 ; AVX512DQ-BW-NEXT: retq
415 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf2:
416 ; AVX512DQ-BW-FCP: # %bb.0:
417 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
418 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
419 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
420 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
421 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm1
422 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
423 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,5,0,0]
424 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %xmm1, %xmm0, %xmm3
425 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
426 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm1
427 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4
428 ; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} ymm5 = ymm4[0],ymm1[0],ymm4[1],ymm1[1],ymm4[4],ymm1[4],ymm4[5],ymm1[5]
429 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm5
430 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [5,13,5,5,5,13,5,5]
431 ; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
432 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %ymm1, %ymm4, %ymm6
433 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm6
434 ; AVX512DQ-BW-FCP-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
435 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm4
436 ; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
437 ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm1
438 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rsi)
439 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
440 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rcx)
441 ; AVX512DQ-BW-FCP-NEXT: vpextrq $1, %xmm0, (%r8)
442 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
443 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11)
444 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r10)
445 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
446 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
447 ; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <16 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 0, i32 8>
  %strided.vec1 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 1, i32 9>
  %strided.vec2 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 2, i32 10>
  %strided.vec3 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 3, i32 11>
  %strided.vec4 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 4, i32 12>
  %strided.vec5 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 5, i32 13>
  %strided.vec6 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 6, i32 14>
  %strided.vec7 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison, <2 x i32> <i32 7, i32 15>
  store <2 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <2 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <2 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <2 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <2 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <2 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <2 x i32> %strided.vec6, ptr %out.vec6, align 64
  store <2 x i32> %strided.vec7, ptr %out.vec7, align 64
  ret void
}

define void @load_i32_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
469 ; SSE-LABEL: load_i32_stride8_vf4:
471 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
472 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
473 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
474 ; SSE-NEXT: movaps 112(%rdi), %xmm3
475 ; SSE-NEXT: movaps 80(%rdi), %xmm2
476 ; SSE-NEXT: movaps (%rdi), %xmm1
477 ; SSE-NEXT: movaps 16(%rdi), %xmm0
478 ; SSE-NEXT: movaps 32(%rdi), %xmm4
479 ; SSE-NEXT: movaps 48(%rdi), %xmm5
480 ; SSE-NEXT: movaps 96(%rdi), %xmm6
481 ; SSE-NEXT: movaps 64(%rdi), %xmm7
482 ; SSE-NEXT: movaps %xmm7, %xmm8
483 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
484 ; SSE-NEXT: movaps %xmm1, %xmm9
485 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
486 ; SSE-NEXT: movaps %xmm9, %xmm10
487 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm8[0]
488 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm8[1]
489 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
490 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
491 ; SSE-NEXT: movaps %xmm1, %xmm4
492 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0]
493 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm7[1]
494 ; SSE-NEXT: movaps %xmm2, %xmm6
495 ; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
496 ; SSE-NEXT: movaps %xmm0, %xmm7
497 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
498 ; SSE-NEXT: movaps %xmm7, %xmm8
499 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm6[0]
500 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm6[1]
501 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
502 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
503 ; SSE-NEXT: movaps %xmm0, %xmm3
504 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
505 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
506 ; SSE-NEXT: movaps %xmm10, (%rsi)
507 ; SSE-NEXT: movaps %xmm9, (%rdx)
508 ; SSE-NEXT: movaps %xmm4, (%rcx)
509 ; SSE-NEXT: movaps %xmm1, (%r8)
510 ; SSE-NEXT: movaps %xmm8, (%r9)
511 ; SSE-NEXT: movaps %xmm7, (%r11)
512 ; SSE-NEXT: movaps %xmm3, (%r10)
513 ; SSE-NEXT: movaps %xmm0, (%rax)
516 ; AVX-LABEL: load_i32_stride8_vf4:
518 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
519 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
520 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
521 ; AVX-NEXT: vmovaps (%rdi), %ymm0
522 ; AVX-NEXT: vmovaps 32(%rdi), %ymm1
523 ; AVX-NEXT: vmovaps 64(%rdi), %ymm2
524 ; AVX-NEXT: vmovaps 96(%rdi), %ymm3
525 ; AVX-NEXT: vmovaps 32(%rdi), %xmm5
526 ; AVX-NEXT: vmovaps (%rdi), %xmm6
527 ; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
528 ; AVX-NEXT: vmovaps 96(%rdi), %xmm7
529 ; AVX-NEXT: vmovaps 64(%rdi), %xmm8
530 ; AVX-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
531 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm9[0]
532 ; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1]
533 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm5[1],xmm10[2,3]
534 ; AVX-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
535 ; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm7[2,2,2,2]
536 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3]
537 ; AVX-NEXT: vunpckhps {{.*#+}} xmm5 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
538 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1],xmm10[2,3]
539 ; AVX-NEXT: vunpckhps {{.*#+}} xmm7 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
540 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
541 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm7 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
542 ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[0,1,2,0,4,5,6,4]
543 ; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
544 ; AVX-NEXT: vunpcklps {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
545 ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
546 ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3]
547 ; AVX-NEXT: vunpcklps {{.*#+}} ymm8 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
548 ; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
549 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
550 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[2,0,2,3,6,4,6,7]
551 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
552 ; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
553 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm10 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
554 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
555 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
556 ; AVX-NEXT: vunpckhps {{.*#+}} ymm11 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
557 ; AVX-NEXT: vextractf128 $1, %ymm11, %xmm11
558 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
559 ; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
560 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
561 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
562 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
563 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
564 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
565 ; AVX-NEXT: vmovaps %xmm4, (%rsi)
566 ; AVX-NEXT: vmovaps %xmm9, (%rdx)
567 ; AVX-NEXT: vmovaps %xmm6, (%rcx)
568 ; AVX-NEXT: vmovaps %xmm5, (%r8)
569 ; AVX-NEXT: vmovaps %xmm7, (%r9)
570 ; AVX-NEXT: vmovaps %xmm8, (%r11)
571 ; AVX-NEXT: vmovaps %xmm10, (%r10)
572 ; AVX-NEXT: vmovaps %xmm0, (%rax)
573 ; AVX-NEXT: vzeroupper
576 ; AVX2-LABEL: load_i32_stride8_vf4:
578 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
579 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
580 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
581 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm0
582 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm1
583 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm2
584 ; AVX2-NEXT: vmovaps (%rdi), %ymm3
585 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm4
586 ; AVX2-NEXT: vbroadcastss %xmm4, %xmm5
587 ; AVX2-NEXT: vmovaps (%rdi), %xmm6
588 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm7
589 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm8
590 ; AVX2-NEXT: vbroadcastss %xmm8, %xmm9
591 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
592 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
593 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3]
594 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
595 ; AVX2-NEXT: vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1]
596 ; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm7[1],xmm10[2,3]
597 ; AVX2-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
598 ; AVX2-NEXT: vshufps {{.*#+}} xmm10 = xmm4[2,2,2,2]
599 ; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3]
600 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
601 ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm10[2,3]
602 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
603 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1]
604 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
605 ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6
606 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
607 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[2,2,2,2]
608 ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3]
609 ; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8
610 ; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5]
611 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7]
612 ; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10
613 ; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
614 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
615 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
616 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
617 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
618 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
619 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
620 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
621 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
622 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
623 ; AVX2-NEXT: vmovaps %xmm5, (%rsi)
624 ; AVX2-NEXT: vmovaps %xmm9, (%rdx)
625 ; AVX2-NEXT: vmovaps %xmm7, (%rcx)
626 ; AVX2-NEXT: vmovaps %xmm4, (%r8)
627 ; AVX2-NEXT: vmovaps %xmm6, (%r9)
628 ; AVX2-NEXT: vmovaps %xmm8, (%r11)
629 ; AVX2-NEXT: vmovaps %xmm1, (%r10)
630 ; AVX2-NEXT: vmovaps %xmm0, (%rax)
631 ; AVX2-NEXT: vzeroupper
634 ; AVX2-FP-LABEL: load_i32_stride8_vf4:
636 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
637 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
638 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
639 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0
640 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1
641 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2
642 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3
643 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm4
644 ; AVX2-FP-NEXT: vbroadcastss %xmm4, %xmm5
645 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm6
646 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm7
647 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm8
648 ; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm9
649 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
650 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
651 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3]
652 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
653 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1]
654 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm7[1],xmm10[2,3]
655 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
656 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm10 = xmm4[2,2,2,2]
657 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3]
658 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
659 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm10[2,3]
660 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
661 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1]
662 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
663 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6
664 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
665 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[2,2,2,2]
666 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3]
667 ; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8
668 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5]
669 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7]
670 ; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10
671 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
672 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
673 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm3
674 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
675 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
676 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
677 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
678 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
679 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
680 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
681 ; AVX2-FP-NEXT: vmovaps %xmm5, (%rsi)
682 ; AVX2-FP-NEXT: vmovaps %xmm9, (%rdx)
683 ; AVX2-FP-NEXT: vmovaps %xmm7, (%rcx)
684 ; AVX2-FP-NEXT: vmovaps %xmm4, (%r8)
685 ; AVX2-FP-NEXT: vmovaps %xmm6, (%r9)
686 ; AVX2-FP-NEXT: vmovaps %xmm8, (%r11)
687 ; AVX2-FP-NEXT: vmovaps %xmm1, (%r10)
688 ; AVX2-FP-NEXT: vmovaps %xmm0, (%rax)
689 ; AVX2-FP-NEXT: vzeroupper
692 ; AVX2-FCP-LABEL: load_i32_stride8_vf4:
694 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
695 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
696 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
697 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0
698 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1
699 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2
700 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3
701 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm4
702 ; AVX2-FCP-NEXT: vbroadcastss %xmm4, %xmm5
703 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm6
704 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm7
705 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm8
706 ; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm9
707 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
708 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
709 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm9[0,1],xmm5[2,3]
710 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm9 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
711 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm10 = xmm6[1,1,1,1]
712 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm7[1],xmm10[2,3]
713 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
714 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm10 = xmm4[2,2,2,2]
715 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm8[0,1,2],xmm10[3]
716 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm6 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
717 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm10[2,3]
718 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
719 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm6[1],xmm4[1]
720 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
721 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
722 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
723 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm8[2,2,2,2]
724 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm10[2,3]
725 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8
726 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm3[1,1,1,1,5,5,5,5]
727 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm2[1],ymm10[2,3,4],ymm2[5],ymm10[6,7]
728 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10
729 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm10[0,1],xmm8[2,3]
730 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
731 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm3
732 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
733 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
734 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
735 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
736 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
737 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
738 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
739 ; AVX2-FCP-NEXT: vmovaps %xmm5, (%rsi)
740 ; AVX2-FCP-NEXT: vmovaps %xmm9, (%rdx)
741 ; AVX2-FCP-NEXT: vmovaps %xmm7, (%rcx)
742 ; AVX2-FCP-NEXT: vmovaps %xmm4, (%r8)
743 ; AVX2-FCP-NEXT: vmovaps %xmm6, (%r9)
744 ; AVX2-FCP-NEXT: vmovaps %xmm8, (%r11)
745 ; AVX2-FCP-NEXT: vmovaps %xmm1, (%r10)
746 ; AVX2-FCP-NEXT: vmovaps %xmm0, (%rax)
747 ; AVX2-FCP-NEXT: vzeroupper
748 ; AVX2-FCP-NEXT: retq
750 ; AVX512-LABEL: load_i32_stride8_vf4:
752 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
753 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
754 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
755 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
756 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
757 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2
758 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
759 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
760 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
761 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
762 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
763 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
764 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
765 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
766 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
767 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
768 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
769 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
770 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm8
771 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
772 ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm9
773 ; AVX512-NEXT: vmovdqa %xmm0, (%rsi)
774 ; AVX512-NEXT: vmovdqa %xmm3, (%rdx)
775 ; AVX512-NEXT: vmovdqa %xmm4, (%rcx)
776 ; AVX512-NEXT: vmovdqa %xmm5, (%r8)
777 ; AVX512-NEXT: vmovdqa %xmm6, (%r9)
778 ; AVX512-NEXT: vmovdqa %xmm7, (%r11)
779 ; AVX512-NEXT: vmovdqa %xmm8, (%r10)
780 ; AVX512-NEXT: vmovdqa %xmm9, (%rax)
781 ; AVX512-NEXT: vzeroupper
784 ; AVX512-FCP-LABEL: load_i32_stride8_vf4:
785 ; AVX512-FCP: # %bb.0:
786 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
787 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
788 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
789 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
790 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
791 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
792 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
793 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
794 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
795 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
796 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
797 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
798 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
799 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
800 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
801 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
802 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
803 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
804 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8
805 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
806 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9
807 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rsi)
808 ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%rdx)
809 ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rcx)
810 ; AVX512-FCP-NEXT: vmovdqa %xmm5, (%r8)
811 ; AVX512-FCP-NEXT: vmovdqa %xmm6, (%r9)
812 ; AVX512-FCP-NEXT: vmovdqa %xmm7, (%r11)
813 ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%r10)
814 ; AVX512-FCP-NEXT: vmovdqa %xmm9, (%rax)
815 ; AVX512-FCP-NEXT: vzeroupper
816 ; AVX512-FCP-NEXT: retq
818 ; AVX512DQ-LABEL: load_i32_stride8_vf4:
820 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
821 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
822 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
823 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
824 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm1
825 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2
826 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
827 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
828 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
829 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
830 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
831 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
832 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
833 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
834 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
835 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
836 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
837 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
838 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm8
839 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
840 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm1, %zmm9
841 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%rsi)
842 ; AVX512DQ-NEXT: vmovdqa %xmm3, (%rdx)
843 ; AVX512DQ-NEXT: vmovdqa %xmm4, (%rcx)
844 ; AVX512DQ-NEXT: vmovdqa %xmm5, (%r8)
845 ; AVX512DQ-NEXT: vmovdqa %xmm6, (%r9)
846 ; AVX512DQ-NEXT: vmovdqa %xmm7, (%r11)
847 ; AVX512DQ-NEXT: vmovdqa %xmm8, (%r10)
848 ; AVX512DQ-NEXT: vmovdqa %xmm9, (%rax)
849 ; AVX512DQ-NEXT: vzeroupper
850 ; AVX512DQ-NEXT: retq
852 ; AVX512DQ-FCP-LABEL: load_i32_stride8_vf4:
853 ; AVX512DQ-FCP: # %bb.0:
854 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
855 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
856 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
857 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
858 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
859 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
860 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
861 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
862 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
863 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
864 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
865 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
866 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
867 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
868 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
869 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
870 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
871 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
872 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8
873 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
874 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9
875 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rsi)
876 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%rdx)
877 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rcx)
878 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, (%r8)
879 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, (%r9)
880 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm7, (%r11)
881 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%r10)
882 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, (%rax)
883 ; AVX512DQ-FCP-NEXT: vzeroupper
884 ; AVX512DQ-FCP-NEXT: retq
886 ; AVX512BW-LABEL: load_i32_stride8_vf4:
888 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
889 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
890 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
891 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
892 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
893 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
894 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
895 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
896 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
897 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
898 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
899 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
900 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
901 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
902 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
903 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
904 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
905 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
906 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8
907 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
908 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9
909 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
910 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
911 ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
912 ; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
913 ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
914 ; AVX512BW-NEXT: vmovdqa %xmm7, (%r11)
915 ; AVX512BW-NEXT: vmovdqa %xmm8, (%r10)
916 ; AVX512BW-NEXT: vmovdqa %xmm9, (%rax)
917 ; AVX512BW-NEXT: vzeroupper
918 ; AVX512BW-NEXT: retq
920 ; AVX512BW-FCP-LABEL: load_i32_stride8_vf4:
921 ; AVX512BW-FCP: # %bb.0:
922 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
923 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
924 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
925 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
926 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
927 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
928 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
929 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
930 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
931 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
932 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
933 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
934 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
935 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
936 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
937 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
938 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
939 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
940 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8
941 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
942 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9
943 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
944 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
945 ; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
946 ; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
947 ; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
948 ; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r11)
949 ; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
950 ; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%rax)
951 ; AVX512BW-FCP-NEXT: vzeroupper
952 ; AVX512BW-FCP-NEXT: retq
954 ; AVX512DQ-BW-LABEL: load_i32_stride8_vf4:
955 ; AVX512DQ-BW: # %bb.0:
956 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
957 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
958 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
959 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
960 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
961 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
962 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
963 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
964 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
965 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
966 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
967 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
968 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
969 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
970 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
971 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
972 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
973 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
974 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8
975 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
976 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9
977 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi)
978 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
979 ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
980 ; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8)
981 ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9)
982 ; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%r11)
983 ; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%r10)
984 ; AVX512DQ-BW-NEXT: vmovdqa %xmm9, (%rax)
985 ; AVX512DQ-BW-NEXT: vzeroupper
986 ; AVX512DQ-BW-NEXT: retq
988 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf4:
989 ; AVX512DQ-BW-FCP: # %bb.0:
990 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
991 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
992 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
993 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,8,16,24]
994 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
995 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
996 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm0
997 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [1,9,17,25]
998 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
999 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [2,10,18,26]
1000 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm4
1001 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [3,11,19,27]
1002 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm5
1003 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [4,12,20,28]
1004 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm6
1005 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [5,13,21,29]
1006 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm7
1007 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [6,14,22,30]
1008 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm8
1009 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [7,15,23,31]
1010 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm1, %zmm9
1011 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
1012 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
1013 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
1014 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
1015 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
1016 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r11)
1017 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
1018 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%rax)
1019 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1020 ; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <32 x i32>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %strided.vec1 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  %strided.vec2 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  %strided.vec3 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  %strided.vec4 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  %strided.vec5 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  %strided.vec6 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  %strided.vec7 = shufflevector <32 x i32> %wide.vec, <32 x i32> poison, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i32> %strided.vec0, ptr %out.vec0, align 64
  store <4 x i32> %strided.vec1, ptr %out.vec1, align 64
  store <4 x i32> %strided.vec2, ptr %out.vec2, align 64
  store <4 x i32> %strided.vec3, ptr %out.vec3, align 64
  store <4 x i32> %strided.vec4, ptr %out.vec4, align 64
  store <4 x i32> %strided.vec5, ptr %out.vec5, align 64
  store <4 x i32> %strided.vec6, ptr %out.vec6, align 64
  store <4 x i32> %strided.vec7, ptr %out.vec7, align 64
  ret void
}

define void @load_i32_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
1042 ; SSE-LABEL: load_i32_stride8_vf8:
1044 ; SSE-NEXT: movaps 112(%rdi), %xmm15
1045 ; SSE-NEXT: movaps 176(%rdi), %xmm4
1046 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1047 ; SSE-NEXT: movaps 144(%rdi), %xmm9
1048 ; SSE-NEXT: movaps (%rdi), %xmm10
1049 ; SSE-NEXT: movaps 32(%rdi), %xmm1
1050 ; SSE-NEXT: movaps 96(%rdi), %xmm13
1051 ; SSE-NEXT: movaps 64(%rdi), %xmm11
1052 ; SSE-NEXT: movaps 160(%rdi), %xmm2
1053 ; SSE-NEXT: movaps 128(%rdi), %xmm6
1054 ; SSE-NEXT: movaps 224(%rdi), %xmm12
1055 ; SSE-NEXT: movaps 192(%rdi), %xmm0
1056 ; SSE-NEXT: movaps %xmm0, %xmm8
1057 ; SSE-NEXT: unpcklps {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1]
1058 ; SSE-NEXT: movaps %xmm6, %xmm5
1059 ; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1060 ; SSE-NEXT: movaps %xmm5, %xmm7
1061 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm8[0]
1062 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1063 ; SSE-NEXT: movaps %xmm11, %xmm14
1064 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
1065 ; SSE-NEXT: movaps %xmm10, %xmm7
1066 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
1067 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm8[1]
1068 ; SSE-NEXT: movaps %xmm7, %xmm8
1069 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0]
1070 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1071 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm14[1]
1072 ; SSE-NEXT: movaps 240(%rdi), %xmm14
1073 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1074 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
1075 ; SSE-NEXT: movaps 208(%rdi), %xmm12
1076 ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm2[2],xmm6[3],xmm2[3]
1077 ; SSE-NEXT: movaps %xmm6, %xmm2
1078 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
1079 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1080 ; SSE-NEXT: unpckhps {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
1081 ; SSE-NEXT: unpckhps {{.*#+}} xmm10 = xmm10[2],xmm1[2],xmm10[3],xmm1[3]
1082 ; SSE-NEXT: movaps %xmm10, %xmm8
1083 ; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm11[0]
1084 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm0[1]
1085 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm11[1]
1086 ; SSE-NEXT: movaps %xmm12, %xmm0
1087 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
1088 ; SSE-NEXT: movaps %xmm9, %xmm11
1089 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
1090 ; SSE-NEXT: movaps %xmm11, %xmm13
1091 ; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm0[0]
1092 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
1093 ; SSE-NEXT: movaps 80(%rdi), %xmm2
1094 ; SSE-NEXT: movaps %xmm2, %xmm1
1095 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
1096 ; SSE-NEXT: movaps 16(%rdi), %xmm0
1097 ; SSE-NEXT: movaps 48(%rdi), %xmm3
1098 ; SSE-NEXT: movaps %xmm0, %xmm14
1099 ; SSE-NEXT: unpcklps {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
1100 ; SSE-NEXT: movaps %xmm14, %xmm4
1101 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm1[0]
1102 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm1[1]
1103 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1104 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
1105 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
1106 ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
1107 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm15[2],xmm2[3],xmm15[3]
1108 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1109 ; SSE-NEXT: movaps %xmm9, %xmm1
1110 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm12[0]
1111 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm12[1]
1112 ; SSE-NEXT: movaps %xmm0, %xmm3
1113 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm2[0]
1114 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
1115 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1116 ; SSE-NEXT: movaps %xmm2, (%rsi)
1117 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1118 ; SSE-NEXT: movaps %xmm2, 16(%rsi)
1119 ; SSE-NEXT: movaps %xmm7, (%rdx)
1120 ; SSE-NEXT: movaps %xmm5, 16(%rdx)
1121 ; SSE-NEXT: movaps %xmm8, (%rcx)
1122 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1123 ; SSE-NEXT: movaps %xmm2, 16(%rcx)
1124 ; SSE-NEXT: movaps %xmm10, (%r8)
1125 ; SSE-NEXT: movaps %xmm6, 16(%r8)
1126 ; SSE-NEXT: movaps %xmm4, (%r9)
1127 ; SSE-NEXT: movaps %xmm13, 16(%r9)
1128 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1129 ; SSE-NEXT: movaps %xmm14, (%rax)
1130 ; SSE-NEXT: movaps %xmm11, 16(%rax)
1131 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1132 ; SSE-NEXT: movaps %xmm3, (%rax)
1133 ; SSE-NEXT: movaps %xmm1, 16(%rax)
1134 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1135 ; SSE-NEXT: movaps %xmm9, 16(%rax)
1136 ; SSE-NEXT: movaps %xmm0, (%rax)
1137 ; SSE-NEXT: retq
1138 ;
1139 ; AVX-LABEL: load_i32_stride8_vf8:
1140 ; AVX: # %bb.0:
1141 ; AVX-NEXT: vmovaps (%rdi), %ymm0
1142 ; AVX-NEXT: vmovaps 32(%rdi), %ymm1
1143 ; AVX-NEXT: vmovaps 64(%rdi), %ymm2
1144 ; AVX-NEXT: vmovaps 96(%rdi), %ymm3
1145 ; AVX-NEXT: vmovaps 32(%rdi), %xmm8
1146 ; AVX-NEXT: vmovaps (%rdi), %xmm11
1147 ; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
1148 ; AVX-NEXT: vmovaps 96(%rdi), %xmm9
1149 ; AVX-NEXT: vmovaps 64(%rdi), %xmm10
1150 ; AVX-NEXT: vmovaps 160(%rdi), %xmm14
1151 ; AVX-NEXT: vmovaps 128(%rdi), %xmm15
1152 ; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
1153 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
1154 ; AVX-NEXT: vmovaps 224(%rdi), %xmm12
1155 ; AVX-NEXT: vmovaps 192(%rdi), %xmm13
1156 ; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
1157 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm6[0,1,0,1]
1158 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
1159 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
1160 ; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
1161 ; AVX-NEXT: vmovlhps {{.*#+}} xmm4 = xmm4[0],xmm7[0]
1162 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
1163 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1164 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm11[1,1,1,1]
1165 ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm8[1],xmm5[2,3]
1166 ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0,1],xmm7[2,3]
1167 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1168 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1]
1169 ; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm14[1],xmm7[2,3]
1170 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
1171 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm6[6,7]
1172 ; AVX-NEXT: vmovaps 160(%rdi), %ymm6
1173 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
1174 ; AVX-NEXT: vmovaps 128(%rdi), %ymm7
1175 ; AVX-NEXT: vunpckhps {{.*#+}} xmm8 = xmm11[2],xmm8[2],xmm11[3],xmm8[3]
1176 ; AVX-NEXT: vunpckhps {{.*#+}} xmm15 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
1177 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm11
1178 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm12[2,2,2,2]
1179 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm13[0,1,2],xmm14[3]
1180 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
1181 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7]
1182 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm9[2,2,2,2]
1183 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm10[0,1,2],xmm14[3]
1184 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm8[0,1],xmm14[2,3]
1185 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm11[4,5,6,7]
1186 ; AVX-NEXT: vmovaps 192(%rdi), %ymm11
1187 ; AVX-NEXT: vunpckhps {{.*#+}} xmm9 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
1188 ; AVX-NEXT: vmovaps 224(%rdi), %ymm10
1189 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1]
1190 ; AVX-NEXT: vunpckhps {{.*#+}} xmm9 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
1191 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
1192 ; AVX-NEXT: vshufps {{.*#+}} xmm12 = xmm15[2,3,2,3]
1193 ; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
1194 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
1195 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm8[0,1,2,3],ymm9[4,5,6,7]
1196 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm10[0],ymm11[0],ymm10[2],ymm11[2]
1197 ; AVX-NEXT: vunpcklps {{.*#+}} ymm12 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
1198 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm12[0,1],ymm8[2,0],ymm12[4,5],ymm8[6,4]
1199 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
1200 ; AVX-NEXT: vshufps {{.*#+}} ymm12 = ymm12[0,1,2,0,4,5,6,4]
1201 ; AVX-NEXT: vextractf128 $1, %ymm12, %xmm12
1202 ; AVX-NEXT: vunpcklps {{.*#+}} ymm13 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5]
1203 ; AVX-NEXT: vextractf128 $1, %ymm13, %xmm13
1204 ; AVX-NEXT: vblendps {{.*#+}} xmm12 = xmm13[0,1],xmm12[2,3]
1205 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm8[4,5,6,7]
1206 ; AVX-NEXT: vunpcklps {{.*#+}} ymm8 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[4],ymm10[4],ymm11[5],ymm10[5]
1207 ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm6[1,0],ymm7[1,0],ymm6[5,4],ymm7[5,4]
1208 ; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm13[2,0],ymm8[2,3],ymm13[6,4],ymm8[6,7]
1209 ; AVX-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
1210 ; AVX-NEXT: vextractf128 $1, %ymm13, %xmm13
1211 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
1212 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
1213 ; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
1214 ; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
1215 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
1216 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm13 = ymm10[1],ymm11[1],ymm10[3],ymm11[3]
1217 ; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm7[2],ymm6[2],ymm7[3],ymm6[3],ymm7[6],ymm6[6],ymm7[7],ymm6[7]
1218 ; AVX-NEXT: vshufps {{.*#+}} ymm13 = ymm15[0,1],ymm13[2,0],ymm15[4,5],ymm13[6,4]
1219 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
1220 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
1221 ; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
1222 ; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[6],ymm1[6],ymm0[7],ymm1[7]
1223 ; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
1224 ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3]
1225 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm13[4,5,6,7]
1226 ; AVX-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[6],ymm10[6],ymm11[7],ymm10[7]
1227 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[3,0],ymm7[3,0],ymm6[7,4],ymm7[7,4]
1228 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm6[2,0],ymm10[2,3],ymm6[6,4],ymm10[6,7]
1229 ; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
1230 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[3,0],ymm0[3,0],ymm1[7,4],ymm0[7,4]
1231 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm1
1232 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
1233 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
1234 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
1235 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
1236 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1237 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
1238 ; AVX-NEXT: vmovaps %ymm5, (%rdx)
1239 ; AVX-NEXT: vmovaps %ymm14, (%rcx)
1240 ; AVX-NEXT: vmovaps %ymm9, (%r8)
1241 ; AVX-NEXT: vmovaps %ymm12, (%r9)
1242 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1243 ; AVX-NEXT: vmovaps %ymm8, (%rax)
1244 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1245 ; AVX-NEXT: vmovaps %ymm4, (%rax)
1246 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1247 ; AVX-NEXT: vmovaps %ymm0, (%rax)
1248 ; AVX-NEXT: vzeroupper
1249 ; AVX-NEXT: retq
1250 ;
1251 ; AVX2-LABEL: load_i32_stride8_vf8:
1252 ; AVX2: # %bb.0:
1253 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm0
1254 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm1
1255 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm2
1256 ; AVX2-NEXT: vmovaps (%rdi), %ymm3
1257 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm7
1258 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm11
1259 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
1260 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
1261 ; AVX2-NEXT: vmovaps 224(%rdi), %xmm8
1262 ; AVX2-NEXT: vbroadcastss %xmm8, %xmm5
1263 ; AVX2-NEXT: vmovaps 192(%rdi), %xmm10
1264 ; AVX2-NEXT: vbroadcastss %xmm10, %xmm6
1265 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1266 ; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
1267 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
1268 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm9
1269 ; AVX2-NEXT: vbroadcastss %xmm9, %xmm5
1270 ; AVX2-NEXT: vmovaps (%rdi), %xmm13
1271 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm14
1272 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm12
1273 ; AVX2-NEXT: vbroadcastss %xmm12, %xmm6
1274 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1275 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
1276 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1277 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1278 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
1279 ; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1]
1280 ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3]
1281 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1282 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
1283 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1284 ; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1]
1285 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
1286 ; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
1287 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7]
1288 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm6
1289 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
1290 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
1291 ; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7
1292 ; AVX2-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2]
1293 ; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
1294 ; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
1295 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
1296 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm11
1297 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3]
1298 ; AVX2-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2]
1299 ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3]
1300 ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
1301 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
1302 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm13
1303 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
1304 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm10
1305 ; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
1306 ; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3]
1307 ; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
1308 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7]
1309 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
1310 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1]
1311 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1312 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5]
1313 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
1314 ; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm12
1315 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
1316 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
1317 ; AVX2-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3]
1318 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5]
1319 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2]
1320 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
1321 ; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm12
1322 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
1323 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7]
1324 ; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14
1325 ; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
1326 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
1327 ; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15
1328 ; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
1329 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
1330 ; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm14
1331 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7]
1332 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7]
1333 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
1334 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm3
1335 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
1336 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
1337 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
1338 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7]
1339 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1340 ; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm3
1341 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7]
1342 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
1343 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
1344 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
1345 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
1346 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1347 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1348 ; AVX2-NEXT: vmovaps %ymm4, (%rsi)
1349 ; AVX2-NEXT: vmovaps %ymm5, (%rdx)
1350 ; AVX2-NEXT: vmovaps %ymm7, (%rcx)
1351 ; AVX2-NEXT: vmovaps %ymm8, (%r8)
1352 ; AVX2-NEXT: vmovaps %ymm9, (%r9)
1353 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1354 ; AVX2-NEXT: vmovaps %ymm12, (%rax)
1355 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1356 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
1357 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
1358 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
1359 ; AVX2-NEXT: vzeroupper
1360 ; AVX2-NEXT: retq
1361 ;
1362 ; AVX2-FP-LABEL: load_i32_stride8_vf8:
1363 ; AVX2-FP: # %bb.0:
1364 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm0
1365 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm1
1366 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm2
1367 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm3
1368 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm7
1369 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm11
1370 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
1371 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
1372 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm8
1373 ; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm5
1374 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm10
1375 ; AVX2-FP-NEXT: vbroadcastss %xmm10, %xmm6
1376 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1377 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
1378 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
1379 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm9
1380 ; AVX2-FP-NEXT: vbroadcastss %xmm9, %xmm5
1381 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm13
1382 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm14
1383 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm12
1384 ; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm6
1385 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1386 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
1387 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1388 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1389 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
1390 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1]
1391 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3]
1392 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1393 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
1394 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1395 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1]
1396 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
1397 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
1398 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7]
1399 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm6
1400 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
1401 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
1402 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7
1403 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2]
1404 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
1405 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
1406 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
1407 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm11
1408 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3]
1409 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2]
1410 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3]
1411 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
1412 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
1413 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm13
1414 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
1415 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm10
1416 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
1417 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3]
1418 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
1419 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7]
1420 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
1421 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1]
1422 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1423 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5]
1424 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
1425 ; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm12
1426 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
1427 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
1428 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3]
1429 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5]
1430 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2]
1431 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
1432 ; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm12
1433 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
1434 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7]
1435 ; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14
1436 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
1437 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
1438 ; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15
1439 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
1440 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
1441 ; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm14
1442 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7]
1443 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7]
1444 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
1445 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm3
1446 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
1447 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
1448 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
1449 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7]
1450 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1451 ; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm3
1452 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7]
1453 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
1454 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
1455 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
1456 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
1457 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1458 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1459 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
1460 ; AVX2-FP-NEXT: vmovaps %ymm5, (%rdx)
1461 ; AVX2-FP-NEXT: vmovaps %ymm7, (%rcx)
1462 ; AVX2-FP-NEXT: vmovaps %ymm8, (%r8)
1463 ; AVX2-FP-NEXT: vmovaps %ymm9, (%r9)
1464 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1465 ; AVX2-FP-NEXT: vmovaps %ymm12, (%rax)
1466 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1467 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
1468 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1469 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
1470 ; AVX2-FP-NEXT: vzeroupper
1471 ; AVX2-FP-NEXT: retq
1472 ;
1473 ; AVX2-FCP-LABEL: load_i32_stride8_vf8:
1474 ; AVX2-FCP: # %bb.0:
1475 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm0
1476 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm1
1477 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm2
1478 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm3
1479 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm7
1480 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm11
1481 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm4 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
1482 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
1483 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm8
1484 ; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm5
1485 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm10
1486 ; AVX2-FCP-NEXT: vbroadcastss %xmm10, %xmm6
1487 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1488 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
1489 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
1490 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm9
1491 ; AVX2-FCP-NEXT: vbroadcastss %xmm9, %xmm5
1492 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm13
1493 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm14
1494 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm12
1495 ; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm6
1496 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1497 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
1498 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1499 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1500 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm5 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
1501 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm6 = xmm13[1,1,1,1]
1502 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm14[1],xmm6[2,3]
1503 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
1504 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm6 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
1505 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1506 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm11[1,1,1,1]
1507 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm7[1],xmm15[2,3]
1508 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
1509 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0,1,2,3,4,5],ymm6[6,7]
1510 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm6
1511 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm15[4,5,6,7]
1512 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm15 = xmm11[2],xmm7[2],xmm11[3],xmm7[3]
1513 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm7
1514 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm8[2,2,2,2]
1515 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
1516 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
1517 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm11[6,7]
1518 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm11
1519 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm13[2],xmm14[2],xmm13[3],xmm14[3]
1520 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm13 = xmm9[2,2,2,2]
1521 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm12[0,1,2],xmm13[3]
1522 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
1523 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm13[0,1,2,3],ymm7[4,5,6,7]
1524 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm13
1525 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm8 = xmm10[2],xmm8[2],xmm10[3],xmm8[3]
1526 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm10
1527 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
1528 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm15[2,3,2,3]
1529 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
1530 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3,4,5],ymm8[6,7]
1531 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm9 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
1532 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm14[1],xmm9[1]
1533 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1534 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm10[0],ymm13[0],ymm10[1],ymm13[1],ymm10[4],ymm13[4],ymm10[5],ymm13[5]
1535 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
1536 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm12
1537 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
1538 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
1539 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm12 = xmm12[0,1],xmm15[2,3]
1540 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm15 = ymm11[0],ymm6[0],ymm11[1],ymm6[1],ymm11[4],ymm6[4],ymm11[5],ymm6[5]
1541 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm15[0],ymm9[2],ymm15[2]
1542 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
1543 ; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm12
1544 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm13[5],ymm12[6,7]
1545 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm15[6,7]
1546 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14
1547 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,1,1,1,5,5,5,5]
1548 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm2[1],ymm15[2,3,4],ymm2[5],ymm15[6,7]
1549 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15
1550 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
1551 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm14[0,1,2,3],ymm12[4,5,6,7]
1552 ; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm14
1553 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,6],ymm14[7]
1554 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm13[2],ymm10[3],ymm13[3],ymm10[6],ymm13[6],ymm10[7],ymm13[7]
1555 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[6],ymm2[6],ymm3[7],ymm2[7]
1556 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm3
1557 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
1558 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
1559 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
1560 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm11[6,7]
1561 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
1562 ; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm3
1563 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm3[2],ymm6[2],ymm3[3],ymm6[3],ymm3[6],ymm6[6],ymm3[7],ymm6[7]
1564 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm10[1],ymm3[1],ymm10[3],ymm3[3]
1565 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
1566 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,3,2,3,6,7,6,7]
1567 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
1568 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1569 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1570 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
1571 ; AVX2-FCP-NEXT: vmovaps %ymm5, (%rdx)
1572 ; AVX2-FCP-NEXT: vmovaps %ymm7, (%rcx)
1573 ; AVX2-FCP-NEXT: vmovaps %ymm8, (%r8)
1574 ; AVX2-FCP-NEXT: vmovaps %ymm9, (%r9)
1575 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1576 ; AVX2-FCP-NEXT: vmovaps %ymm12, (%rax)
1577 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1578 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
1579 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1580 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
1581 ; AVX2-FCP-NEXT: vzeroupper
1582 ; AVX2-FCP-NEXT: retq
1583 ;
1584 ; AVX512-LABEL: load_i32_stride8_vf8:
1585 ; AVX512: # %bb.0:
1586 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1587 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
1588 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
1589 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
1590 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
1591 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2
1592 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3
1593 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1594 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
1595 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1596 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1597 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1598 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1599 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
1600 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1601 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1602 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1603 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1604 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1605 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1606 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
1607 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1608 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1609 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
1610 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1611 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1612 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1613 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1614 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
1615 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1616 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
1617 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1618 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1619 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
1620 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1621 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
1622 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1623 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1624 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
1625 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1626 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
1627 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1628 ; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1629 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm11
1630 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1631 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
1632 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1633 ; AVX512-NEXT: vmovdqa %ymm4, (%rsi)
1634 ; AVX512-NEXT: vmovdqa %ymm5, (%rdx)
1635 ; AVX512-NEXT: vmovdqa %ymm6, (%rcx)
1636 ; AVX512-NEXT: vmovdqa %ymm7, (%r8)
1637 ; AVX512-NEXT: vmovdqa %ymm8, (%r9)
1638 ; AVX512-NEXT: vmovdqa %ymm9, (%r11)
1639 ; AVX512-NEXT: vmovdqa %ymm10, (%r10)
1640 ; AVX512-NEXT: vmovdqa %ymm0, (%rax)
1641 ; AVX512-NEXT: vzeroupper
1642 ; AVX512-NEXT: retq
1643 ;
1644 ; AVX512-FCP-LABEL: load_i32_stride8_vf8:
1645 ; AVX512-FCP: # %bb.0:
1646 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1647 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1648 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1649 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1650 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1651 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1652 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1653 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1654 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
1655 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1656 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1657 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1658 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1659 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
1660 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1661 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1662 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1663 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1664 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1665 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1666 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
1667 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1668 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1669 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
1670 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1671 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1672 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1673 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1674 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
1675 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1676 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
1677 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1678 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1679 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
1680 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1681 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
1682 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1683 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1684 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
1685 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1686 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
1687 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1688 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1689 ; AVX512-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11
1690 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1691 ; AVX512-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
1692 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1693 ; AVX512-FCP-NEXT: vmovdqa %ymm4, (%rsi)
1694 ; AVX512-FCP-NEXT: vmovdqa %ymm5, (%rdx)
1695 ; AVX512-FCP-NEXT: vmovdqa %ymm6, (%rcx)
1696 ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%r8)
1697 ; AVX512-FCP-NEXT: vmovdqa %ymm8, (%r9)
1698 ; AVX512-FCP-NEXT: vmovdqa %ymm9, (%r11)
1699 ; AVX512-FCP-NEXT: vmovdqa %ymm10, (%r10)
1700 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
1701 ; AVX512-FCP-NEXT: vzeroupper
1702 ; AVX512-FCP-NEXT: retq
1703 ;
1704 ; AVX512DQ-LABEL: load_i32_stride8_vf8:
1705 ; AVX512DQ: # %bb.0:
1706 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1707 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
1708 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
1709 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
1710 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm1
1711 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm2
1712 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm3
1713 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1714 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
1715 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1716 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1717 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1718 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1719 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
1720 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1721 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1722 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1723 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1724 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1725 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1726 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
1727 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1728 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1729 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
1730 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1731 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1732 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1733 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1734 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
1735 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1736 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
1737 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1738 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1739 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
1740 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1741 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
1742 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1743 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1744 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
1745 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1746 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
1747 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1748 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1749 ; AVX512DQ-NEXT: vpermi2d %zmm3, %zmm2, %zmm11
1750 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1751 ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
1752 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1753 ; AVX512DQ-NEXT: vmovdqa %ymm4, (%rsi)
1754 ; AVX512DQ-NEXT: vmovdqa %ymm5, (%rdx)
1755 ; AVX512DQ-NEXT: vmovdqa %ymm6, (%rcx)
1756 ; AVX512DQ-NEXT: vmovdqa %ymm7, (%r8)
1757 ; AVX512DQ-NEXT: vmovdqa %ymm8, (%r9)
1758 ; AVX512DQ-NEXT: vmovdqa %ymm9, (%r11)
1759 ; AVX512DQ-NEXT: vmovdqa %ymm10, (%r10)
1760 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
1761 ; AVX512DQ-NEXT: vzeroupper
1762 ; AVX512DQ-NEXT: retq
1763 ;
1764 ; AVX512DQ-FCP-LABEL: load_i32_stride8_vf8:
1765 ; AVX512DQ-FCP: # %bb.0:
1766 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1767 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1768 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1769 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1770 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1771 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1772 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1773 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1774 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
1775 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1776 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1777 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1778 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1779 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
1780 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1781 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1782 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1783 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1784 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1785 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1786 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
1787 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1788 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1789 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
1790 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1791 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1792 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1793 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1794 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
1795 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1796 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
1797 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1798 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1799 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
1800 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1801 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
1802 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1803 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1804 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
1805 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1806 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
1807 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1808 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1809 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11
1810 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1811 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
1812 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1813 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm4, (%rsi)
1814 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm5, (%rdx)
1815 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm6, (%rcx)
1816 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%r8)
1817 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm8, (%r9)
1818 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm9, (%r11)
1819 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm10, (%r10)
1820 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
1821 ; AVX512DQ-FCP-NEXT: vzeroupper
1822 ; AVX512DQ-FCP-NEXT: retq
1823 ;
1824 ; AVX512BW-LABEL: load_i32_stride8_vf8:
1825 ; AVX512BW: # %bb.0:
1826 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1827 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1828 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
1829 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
1830 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1831 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1832 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
1833 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1834 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
1835 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1836 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1837 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1838 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1839 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
1840 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1841 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1842 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1843 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1844 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1845 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1846 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
1847 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1848 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1849 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
1850 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1851 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1852 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1853 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1854 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
1855 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1856 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
1857 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1858 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1859 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
1860 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1861 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
1862 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1863 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1864 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
1865 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1866 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
1867 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1868 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1869 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11
1870 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1871 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
1872 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1873 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi)
1874 ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx)
1875 ; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx)
1876 ; AVX512BW-NEXT: vmovdqa %ymm7, (%r8)
1877 ; AVX512BW-NEXT: vmovdqa %ymm8, (%r9)
1878 ; AVX512BW-NEXT: vmovdqa %ymm9, (%r11)
1879 ; AVX512BW-NEXT: vmovdqa %ymm10, (%r10)
1880 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
1881 ; AVX512BW-NEXT: vzeroupper
1882 ; AVX512BW-NEXT: retq
1883 ;
1884 ; AVX512BW-FCP-LABEL: load_i32_stride8_vf8:
1885 ; AVX512BW-FCP: # %bb.0:
1886 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1887 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1888 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1889 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
1890 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
1891 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
1892 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
1893 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1894 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
1895 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1896 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1897 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1898 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1899 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
1900 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1901 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1902 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1903 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1904 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1905 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1906 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
1907 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1908 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1909 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
1910 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1911 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1912 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1913 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1914 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
1915 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1916 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
1917 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1918 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1919 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
1920 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1921 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
1922 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1923 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1924 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
1925 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1926 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
1927 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1928 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1929 ; AVX512BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11
1930 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1931 ; AVX512BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
1932 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1933 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
1934 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
1935 ; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
1936 ; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
1937 ; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
1938 ; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r11)
1939 ; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%r10)
1940 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
1941 ; AVX512BW-FCP-NEXT: vzeroupper
1942 ; AVX512BW-FCP-NEXT: retq
1943 ;
1944 ; AVX512DQ-BW-LABEL: load_i32_stride8_vf8:
1945 ; AVX512DQ-BW: # %bb.0:
1946 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1947 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1948 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
1949 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
1950 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
1951 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
1952 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
1953 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
1954 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
1955 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
1956 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
1957 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
1958 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
1959 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
1960 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
1961 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
1962 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
1963 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
1964 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
1965 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
1966 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
1967 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1968 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
1969 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
1970 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
1971 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
1972 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
1973 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
1974 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
1975 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
1976 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
1977 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
1978 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
1979 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
1980 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
1981 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
1982 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
1983 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
1984 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
1985 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
1986 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
1987 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1988 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
1989 ; AVX512DQ-BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11
1990 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
1991 ; AVX512DQ-BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
1992 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
1993 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi)
1994 ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx)
1995 ; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rcx)
1996 ; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8)
1997 ; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9)
1998 ; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%r11)
1999 ; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%r10)
2000 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
2001 ; AVX512DQ-BW-NEXT: vzeroupper
2002 ; AVX512DQ-BW-NEXT: retq
2003 ;
2004 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf8:
2005 ; AVX512DQ-BW-FCP: # %bb.0:
2006 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2007 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
2008 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
2009 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
2010 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
2011 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
2012 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
2013 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,8,16,24]
2014 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
2015 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,8,16,24]
2016 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm5
2017 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
2018 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,9,17,25]
2019 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm5
2020 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm6 = [1,9,17,25]
2021 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm6
2022 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
2023 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [0,0,0,0,2,10,18,26]
2024 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
2025 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [2,10,18,26]
2026 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm7
2027 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
2028 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,0,0,0,3,11,19,27]
2029 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm7
2030 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm8 = [3,11,19,27]
2031 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm8
2032 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
2033 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,4,12,20,28]
2034 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm8
2035 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [4,12,20,28]
2036 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm9
2037 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2038 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,5,13,21,29]
2039 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm9
2040 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [5,13,21,29]
2041 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm10
2042 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
2043 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,6,14,22,30]
2044 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm10
2045 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [6,14,22,30]
2046 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm11
2047 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2048 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,0,0,0,7,15,23,31]
2049 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm3, %zmm2, %zmm11
2050 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} xmm2 = [7,15,23,31]
2051 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
2052 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
2053 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
2054 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
2055 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
2056 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
2057 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
2058 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r11)
2059 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%r10)
2060 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
2061 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
2062 ; AVX512DQ-BW-FCP-NEXT: retq
2063 %wide.vec = load <64 x i32>, ptr %in.vec, align 64
2064 %strided.vec0 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
2065 %strided.vec1 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
2066 %strided.vec2 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
2067 %strided.vec3 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
2068 %strided.vec4 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
2069 %strided.vec5 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
2070 %strided.vec6 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
2071 %strided.vec7 = shufflevector <64 x i32> %wide.vec, <64 x i32> poison, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
2072 store <8 x i32> %strided.vec0, ptr %out.vec0, align 64
2073 store <8 x i32> %strided.vec1, ptr %out.vec1, align 64
2074 store <8 x i32> %strided.vec2, ptr %out.vec2, align 64
2075 store <8 x i32> %strided.vec3, ptr %out.vec3, align 64
2076 store <8 x i32> %strided.vec4, ptr %out.vec4, align 64
2077 store <8 x i32> %strided.vec5, ptr %out.vec5, align 64
2078 store <8 x i32> %strided.vec6, ptr %out.vec6, align 64
2079 store <8 x i32> %strided.vec7, ptr %out.vec7, align 64
2080 ret void
2081 }
2083 define void @load_i32_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
2084 ; SSE-LABEL: load_i32_stride8_vf16:
2085 ; SSE: # %bb.0:
2086 ; SSE-NEXT: subq $296, %rsp # imm = 0x128
2087 ; SSE-NEXT: movaps 288(%rdi), %xmm6
2088 ; SSE-NEXT: movaps 352(%rdi), %xmm0
2089 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2090 ; SSE-NEXT: movaps 320(%rdi), %xmm5
2091 ; SSE-NEXT: movaps 416(%rdi), %xmm2
2092 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2093 ; SSE-NEXT: movaps 384(%rdi), %xmm12
2094 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2095 ; SSE-NEXT: movaps 480(%rdi), %xmm13
2096 ; SSE-NEXT: movaps 448(%rdi), %xmm4
2097 ; SSE-NEXT: movaps 160(%rdi), %xmm7
2098 ; SSE-NEXT: movaps 128(%rdi), %xmm10
2099 ; SSE-NEXT: movaps 224(%rdi), %xmm8
2100 ; SSE-NEXT: movaps 192(%rdi), %xmm3
2101 ; SSE-NEXT: movaps %xmm3, %xmm9
2102 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2103 ; SSE-NEXT: movaps %xmm10, %xmm11
2104 ; SSE-NEXT: movaps %xmm10, %xmm14
2105 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm7[0],xmm11[1],xmm7[1]
2106 ; SSE-NEXT: movaps %xmm11, %xmm10
2107 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0]
2108 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2109 ; SSE-NEXT: movaps %xmm4, %xmm10
2110 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
2111 ; SSE-NEXT: unpcklps {{.*#+}} xmm12 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
2112 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm9[1]
2113 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2114 ; SSE-NEXT: movaps %xmm12, %xmm9
2115 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm10[0]
2116 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2117 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm10[1]
2118 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2119 ; SSE-NEXT: movaps %xmm5, %xmm9
2120 ; SSE-NEXT: unpcklps {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
2121 ; SSE-NEXT: movaps 256(%rdi), %xmm15
2122 ; SSE-NEXT: movaps %xmm15, %xmm0
2123 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
2124 ; SSE-NEXT: movaps %xmm0, %xmm10
2125 ; SSE-NEXT: movlhps {{.*#+}} xmm10 = xmm10[0],xmm9[0]
2126 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2127 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1]
2128 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2129 ; SSE-NEXT: movaps 96(%rdi), %xmm10
2130 ; SSE-NEXT: movaps 64(%rdi), %xmm9
2131 ; SSE-NEXT: movaps %xmm9, %xmm11
2132 ; SSE-NEXT: unpcklps {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
2133 ; SSE-NEXT: movaps (%rdi), %xmm2
2134 ; SSE-NEXT: movaps 32(%rdi), %xmm12
2135 ; SSE-NEXT: movaps %xmm2, %xmm1
2136 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
2137 ; SSE-NEXT: movaps %xmm1, %xmm0
2138 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm11[0]
2139 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2140 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm11[1]
2141 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2142 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
2143 ; SSE-NEXT: unpckhps {{.*#+}} xmm14 = xmm14[2],xmm7[2],xmm14[3],xmm7[3]
2144 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
2145 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
2146 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
2147 ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
2148 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2149 ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
2150 ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
2151 ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2152 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm12[2],xmm2[3],xmm12[3]
2153 ; SSE-NEXT: movaps %xmm14, %xmm0
2154 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
2155 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2156 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm3[1]
2157 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2158 ; SSE-NEXT: movaps %xmm13, %xmm0
2159 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm4[0]
2160 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2161 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm4[1]
2162 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2163 ; SSE-NEXT: movaps %xmm15, %xmm0
2164 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
2165 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2166 ; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm5[1]
2167 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2168 ; SSE-NEXT: movaps %xmm2, %xmm0
2169 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0]
2170 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2171 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm9[1]
2172 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2173 ; SSE-NEXT: movaps 240(%rdi), %xmm1
2174 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
2175 ; SSE-NEXT: movaps 208(%rdi), %xmm15
2176 ; SSE-NEXT: movaps %xmm15, %xmm0
2177 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2178 ; SSE-NEXT: movaps 176(%rdi), %xmm2
2179 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2180 ; SSE-NEXT: movaps 144(%rdi), %xmm1
2181 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2182 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
2183 ; SSE-NEXT: movaps %xmm1, %xmm2
2184 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2185 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2186 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
2187 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2188 ; SSE-NEXT: movaps 496(%rdi), %xmm1
2189 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2190 ; SSE-NEXT: movaps 464(%rdi), %xmm5
2191 ; SSE-NEXT: movaps %xmm5, %xmm0
2192 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2193 ; SSE-NEXT: movaps 432(%rdi), %xmm1
2194 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2195 ; SSE-NEXT: movaps 400(%rdi), %xmm6
2196 ; SSE-NEXT: movaps %xmm6, %xmm10
2197 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
2198 ; SSE-NEXT: movaps %xmm10, %xmm1
2199 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2200 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2201 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
2202 ; SSE-NEXT: movaps 368(%rdi), %xmm14
2203 ; SSE-NEXT: movaps 336(%rdi), %xmm2
2204 ; SSE-NEXT: movaps %xmm2, %xmm0
2205 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
2206 ; SSE-NEXT: movaps 304(%rdi), %xmm12
2207 ; SSE-NEXT: movaps 272(%rdi), %xmm7
2208 ; SSE-NEXT: movaps %xmm7, %xmm4
2209 ; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1]
2210 ; SSE-NEXT: movaps %xmm4, %xmm1
2211 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2212 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2213 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
2214 ; SSE-NEXT: movaps 112(%rdi), %xmm13
2215 ; SSE-NEXT: movaps 80(%rdi), %xmm1
2216 ; SSE-NEXT: movaps %xmm1, %xmm0
2217 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
2218 ; SSE-NEXT: movaps 16(%rdi), %xmm8
2219 ; SSE-NEXT: movaps 48(%rdi), %xmm11
2220 ; SSE-NEXT: movaps %xmm8, %xmm3
2221 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1]
2222 ; SSE-NEXT: movaps %xmm3, %xmm9
2223 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
2224 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2225 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
2226 ; SSE-NEXT: unpckhps (%rsp), %xmm15 # 16-byte Folded Reload
2227 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
2228 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2229 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2230 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
2231 ; SSE-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm14[2],xmm2[3],xmm14[3]
2232 ; SSE-NEXT: unpckhps {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
2233 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2234 ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
2235 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2236 ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
2237 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
2238 ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm11[2],xmm8[3],xmm11[3]
2239 ; SSE-NEXT: movaps %xmm0, %xmm11
2240 ; SSE-NEXT: movlhps {{.*#+}} xmm11 = xmm11[0],xmm15[0]
2241 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm15[1]
2242 ; SSE-NEXT: movaps %xmm0, %xmm12
2243 ; SSE-NEXT: movaps %xmm7, %xmm9
2244 ; SSE-NEXT: movlhps {{.*#+}} xmm9 = xmm9[0],xmm2[0]
2245 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
2246 ; SSE-NEXT: movaps %xmm6, %xmm0
2247 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm5[0]
2248 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm5[1]
2249 ; SSE-NEXT: movaps %xmm8, %xmm2
2250 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
2251 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm1[1]
2252 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2253 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
2254 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2255 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
2256 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2257 ; SSE-NEXT: movaps %xmm1, (%rsi)
2258 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2259 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
2260 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2261 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
2262 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2263 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
2264 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2265 ; SSE-NEXT: movaps %xmm1, (%rdx)
2266 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2267 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
2268 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2269 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
2270 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2271 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
2272 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2273 ; SSE-NEXT: movaps %xmm1, (%rcx)
2274 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2275 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
2276 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2277 ; SSE-NEXT: movaps %xmm1, 32(%r8)
2278 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2279 ; SSE-NEXT: movaps %xmm1, 48(%r8)
2280 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2281 ; SSE-NEXT: movaps %xmm1, (%r8)
2282 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2283 ; SSE-NEXT: movaps %xmm1, 16(%r8)
2284 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2285 ; SSE-NEXT: movaps %xmm1, 32(%r9)
2286 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2287 ; SSE-NEXT: movaps %xmm1, 48(%r9)
2288 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2289 ; SSE-NEXT: movaps %xmm1, (%r9)
2290 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2291 ; SSE-NEXT: movaps %xmm1, 16(%r9)
2292 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2293 ; SSE-NEXT: movaps %xmm4, 32(%rax)
2294 ; SSE-NEXT: movaps %xmm10, 48(%rax)
2295 ; SSE-NEXT: movaps %xmm3, (%rax)
2296 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2297 ; SSE-NEXT: movaps %xmm1, 16(%rax)
2298 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2299 ; SSE-NEXT: movaps %xmm0, 48(%rax)
2300 ; SSE-NEXT: movaps %xmm9, 32(%rax)
2301 ; SSE-NEXT: movaps %xmm11, 16(%rax)
2302 ; SSE-NEXT: movaps %xmm2, (%rax)
2303 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
2304 ; SSE-NEXT: movaps %xmm6, 48(%rax)
2305 ; SSE-NEXT: movaps %xmm7, 32(%rax)
2306 ; SSE-NEXT: movaps %xmm12, 16(%rax)
2307 ; SSE-NEXT: movaps %xmm8, (%rax)
2308 ; SSE-NEXT: addq $296, %rsp # imm = 0x128
2309 ; SSE-NEXT: retq
2311 ; AVX-LABEL: load_i32_stride8_vf16:
2312 ; AVX: # %bb.0:
2313 ; AVX-NEXT: subq $616, %rsp # imm = 0x268
2314 ; AVX-NEXT: vmovaps 32(%rdi), %xmm14
2315 ; AVX-NEXT: vmovaps (%rdi), %xmm9
2316 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
2317 ; AVX-NEXT: vmovaps 96(%rdi), %xmm1
2318 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2319 ; AVX-NEXT: vmovaps 64(%rdi), %xmm2
2320 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2321 ; AVX-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2322 ; AVX-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm8[0]
2323 ; AVX-NEXT: vmovaps 160(%rdi), %xmm5
2324 ; AVX-NEXT: vmovaps 128(%rdi), %xmm10
2325 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm5[0],xmm10[1],xmm5[1]
2326 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6
2327 ; AVX-NEXT: vmovaps 224(%rdi), %xmm0
2328 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2329 ; AVX-NEXT: vmovaps 192(%rdi), %xmm1
2330 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2331 ; AVX-NEXT: vunpcklps {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2332 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm4[0,1,0,1]
2333 ; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
2334 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
2335 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm6[4,5,6,7]
2336 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2337 ; AVX-NEXT: vmovaps 416(%rdi), %xmm11
2338 ; AVX-NEXT: vmovaps 384(%rdi), %xmm12
2339 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
2340 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
2341 ; AVX-NEXT: vmovaps 480(%rdi), %xmm0
2342 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
2343 ; AVX-NEXT: vmovaps 448(%rdi), %xmm1
2344 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2345 ; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2346 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm3[0,1,0,1]
2347 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
2348 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm6[6,7]
2349 ; AVX-NEXT: vmovaps 288(%rdi), %xmm13
2350 ; AVX-NEXT: vmovaps 256(%rdi), %xmm15
2351 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm13[0],xmm15[1],xmm13[1]
2352 ; AVX-NEXT: vmovaps 352(%rdi), %xmm7
2353 ; AVX-NEXT: vmovaps 320(%rdi), %xmm6
2354 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
2355 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2356 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2357 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2358 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm9[1,1,1,1]
2359 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm14[1],xmm1[2,3]
2360 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
2361 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm2
2362 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1]
2363 ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3]
2364 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
2365 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
2366 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2367 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2368 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm15[1,1,1,1]
2369 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0],xmm13[1],xmm1[2,3]
2370 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2371 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
2372 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1]
2373 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm11[1],xmm2[2,3]
2374 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
2375 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2376 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2377 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2378 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm9[2],xmm14[2],xmm9[3],xmm14[3]
2379 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
2380 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2381 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm14[2,2,2,2]
2382 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
2383 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm10[0,1,2],xmm2[3]
2384 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
2385 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
2386 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2387 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
2388 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2]
2389 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2390 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm8[0,1,2],xmm3[3]
2391 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3]
2392 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2393 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2394 ; AVX-NEXT: vunpckhps {{.*#+}} xmm2 = xmm15[2],xmm13[2],xmm15[3],xmm13[3]
2395 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm12[2],xmm11[2],xmm12[3],xmm11[3]
2396 ; AVX-NEXT: vmovaps (%rsp), %xmm15 # 16-byte Reload
2397 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm15[2,2,2,2]
2398 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
2399 ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm11[0,1,2],xmm4[3]
2400 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
2401 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm5
2402 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
2403 ; AVX-NEXT: vshufps {{.*#+}} xmm5 = xmm7[2,2,2,2]
2404 ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3]
2405 ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm2[0,1],xmm5[2,3]
2406 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
2407 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2408 ; AVX-NEXT: vunpckhps {{.*#+}} xmm4 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
2409 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm4[1]
2410 ; AVX-NEXT: vunpckhps {{.*#+}} xmm4 = xmm10[2],xmm14[2],xmm10[3],xmm14[3]
2411 ; AVX-NEXT: vmovaps 288(%rdi), %ymm5
2412 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2413 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
2414 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[2,3,2,3]
2415 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2416 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
2417 ; AVX-NEXT: vmovaps 256(%rdi), %ymm4
2418 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2419 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2420 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2421 ; AVX-NEXT: vmovaps 416(%rdi), %ymm8
2422 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2423 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
2424 ; AVX-NEXT: vmovaps 384(%rdi), %ymm6
2425 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm2[1],xmm0[1]
2426 ; AVX-NEXT: vmovaps 448(%rdi), %ymm7
2427 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2428 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
2429 ; AVX-NEXT: vmovaps 480(%rdi), %ymm9
2430 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2431 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm3[2,3,2,3]
2432 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
2433 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2434 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2435 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2436 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm9[0],ymm7[0],ymm9[2],ymm7[2]
2437 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm6[0],ymm8[0],ymm6[1],ymm8[1],ymm6[4],ymm8[4],ymm6[5],ymm8[5]
2438 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2439 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
2440 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[4],ymm5[4],ymm4[5],ymm5[5]
2441 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
2442 ; AVX-NEXT: vmovaps 320(%rdi), %ymm10
2443 ; AVX-NEXT: vmovaps 352(%rdi), %ymm11
2444 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm11[0],ymm10[0],ymm11[2],ymm10[2]
2445 ; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2446 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
2447 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
2448 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
2449 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2450 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
2451 ; AVX-NEXT: vmovaps 160(%rdi), %ymm7
2452 ; AVX-NEXT: vmovaps 128(%rdi), %ymm5
2453 ; AVX-NEXT: vmovaps 192(%rdi), %ymm1
2454 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2455 ; AVX-NEXT: vmovaps 224(%rdi), %ymm0
2456 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2457 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
2458 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm5[0],ymm7[0],ymm5[1],ymm7[1],ymm5[4],ymm7[4],ymm5[5],ymm7[5]
2459 ; AVX-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2460 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
2461 ; AVX-NEXT: vmovaps 64(%rdi), %ymm0
2462 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2463 ; AVX-NEXT: vmovaps 96(%rdi), %ymm14
2464 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm14[0],ymm0[0],ymm14[2],ymm0[2]
2465 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
2466 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
2467 ; AVX-NEXT: vmovaps (%rdi), %ymm13
2468 ; AVX-NEXT: vmovaps 32(%rdi), %ymm12
2469 ; AVX-NEXT: vunpcklps {{.*#+}} ymm15 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[4],ymm12[4],ymm13[5],ymm12[5]
2470 ; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
2471 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
2472 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2473 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2474 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2475 ; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm9[0],ymm4[1],ymm9[1],ymm4[4],ymm9[4],ymm4[5],ymm9[5]
2476 ; AVX-NEXT: vmovaps %ymm9, %ymm8
2477 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
2478 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm9[1,0],ymm6[1,0],ymm9[5,4],ymm6[5,4]
2479 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
2480 ; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm10[0],ymm11[0],ymm10[1],ymm11[1],ymm10[4],ymm11[4],ymm10[5],ymm11[5]
2481 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
2482 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2483 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2484 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm0[1,0],ymm1[1,0],ymm0[5,4],ymm1[5,4]
2485 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
2486 ; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
2487 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
2488 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm6[4,5,6,7]
2489 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2490 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2491 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
2492 ; AVX-NEXT: vunpcklps {{.*#+}} ymm15 = ymm3[0],ymm6[0],ymm3[1],ymm6[1],ymm3[4],ymm6[4],ymm3[5],ymm6[5]
2493 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm7[1,0],ymm5[1,0],ymm7[5,4],ymm5[5,4]
2494 ; AVX-NEXT: vmovaps %ymm7, %ymm11
2495 ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,0],ymm15[2,3],ymm2[6,4],ymm15[6,7]
2496 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
2497 ; AVX-NEXT: vunpcklps {{.*#+}} ymm2 = ymm5[0],ymm14[0],ymm5[1],ymm14[1],ymm5[4],ymm14[4],ymm5[5],ymm14[5]
2498 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
2499 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm12[1,0],ymm13[1,0],ymm12[5,4],ymm13[5,4]
2500 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
2501 ; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
2502 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
2503 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
2504 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2505 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm7 = ymm8[1],ymm4[1],ymm8[3],ymm4[3]
2506 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2507 ; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm4[2],ymm9[2],ymm4[3],ymm9[3],ymm4[6],ymm9[6],ymm4[7],ymm9[7]
2508 ; AVX-NEXT: vshufps {{.*#+}} ymm9 = ymm2[0,1],ymm7[2,0],ymm2[4,5],ymm7[6,4]
2509 ; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[6],ymm0[6],ymm1[7],ymm0[7]
2510 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
2511 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
2512 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm7[1],ymm10[1],ymm7[3],ymm10[3]
2513 ; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
2514 ; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
2515 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm15[2,3]
2516 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm9[4,5,6,7]
2517 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2518 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm3[1],ymm6[3],ymm3[3]
2519 ; AVX-NEXT: vmovaps %ymm3, %ymm9
2520 ; AVX-NEXT: vmovaps %ymm11, %ymm1
2521 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2522 ; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm3[2],ymm11[2],ymm3[3],ymm11[3],ymm3[6],ymm11[6],ymm3[7],ymm11[7]
2523 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,1],ymm0[2,0],ymm2[4,5],ymm0[6,4]
2524 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm14[1],ymm5[1],ymm14[3],ymm5[3]
2525 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
2526 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
2527 ; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[6],ymm12[6],ymm13[7],ymm12[7]
2528 ; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
2529 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm15[0,1],xmm2[2,3]
2530 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2531 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2532 ; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
2533 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2534 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm4[3,0],ymm2[7,4],ymm4[7,4]
2535 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm2[2,0],ymm0[2,3],ymm2[6,4],ymm0[6,7]
2536 ; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm10[2],ymm7[2],ymm10[3],ymm7[3],ymm10[6],ymm7[6],ymm10[7],ymm7[7]
2537 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2538 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm7 # 32-byte Folded Reload
2539 ; AVX-NEXT: # ymm7 = ymm4[3,0],mem[3,0],ymm4[7,4],mem[7,4]
2540 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
2541 ; AVX-NEXT: vshufps {{.*#+}} ymm7 = ymm7[2,0,2,3,6,4,6,7]
2542 ; AVX-NEXT: vextractf128 $1, %ymm7, %xmm7
2543 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
2544 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2545 ; AVX-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7]
2546 ; AVX-NEXT: vshufps {{.*#+}} ymm6 = ymm1[3,0],ymm3[3,0],ymm1[7,4],ymm3[7,4]
2547 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm6[2,0],ymm2[2,3],ymm6[6,4],ymm2[6,7]
2548 ; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm5[2],ymm14[2],ymm5[3],ymm14[3],ymm5[6],ymm14[6],ymm5[7],ymm14[7]
2549 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm12[3,0],ymm13[3,0],ymm12[7,4],ymm13[7,4]
2550 ; AVX-NEXT: vextractf128 $1, %ymm4, %xmm3
2551 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[2,0,2,3,6,4,6,7]
2552 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
2553 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
2554 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2555 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2556 ; AVX-NEXT: vmovaps %ymm2, 32(%rsi)
2557 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2558 ; AVX-NEXT: vmovaps %ymm2, (%rsi)
2559 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2560 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
2561 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2562 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
2563 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2564 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
2565 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2566 ; AVX-NEXT: vmovaps %ymm2, (%rcx)
2567 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2568 ; AVX-NEXT: vmovaps %ymm2, 32(%r8)
2569 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2570 ; AVX-NEXT: vmovaps %ymm2, (%r8)
2571 ; AVX-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
2572 ; AVX-NEXT: vmovaps %ymm2, 32(%r9)
2573 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2574 ; AVX-NEXT: vmovaps %ymm2, (%r9)
2575 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2576 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2577 ; AVX-NEXT: vmovaps %ymm2, 32(%rax)
2578 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2579 ; AVX-NEXT: vmovaps %ymm2, (%rax)
2580 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2581 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2582 ; AVX-NEXT: vmovaps %ymm2, 32(%rax)
2583 ; AVX-NEXT: vmovaps %ymm11, (%rax)
2584 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
2585 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
2586 ; AVX-NEXT: vmovaps %ymm1, (%rax)
2587 ; AVX-NEXT: addq $616, %rsp # imm = 0x268
2588 ; AVX-NEXT: vzeroupper
2589 ; AVX-NEXT: retq
2591 ; AVX2-LABEL: load_i32_stride8_vf16:
2592 ; AVX2: # %bb.0:
2593 ; AVX2-NEXT: subq $456, %rsp # imm = 0x1C8
2594 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm1
2595 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm9
2596 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
2597 ; AVX2-NEXT: vmovaps %xmm1, %xmm8
2598 ; AVX2-NEXT: vmovaps 352(%rdi), %xmm1
2599 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2600 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm2
2601 ; AVX2-NEXT: vmovaps 320(%rdi), %xmm1
2602 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2603 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm3
2604 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2605 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2606 ; AVX2-NEXT: vmovaps 416(%rdi), %xmm1
2607 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2608 ; AVX2-NEXT: vmovaps 384(%rdi), %xmm2
2609 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2610 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2611 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
2612 ; AVX2-NEXT: vmovaps 480(%rdi), %xmm1
2613 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2614 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm3
2615 ; AVX2-NEXT: vmovaps 448(%rdi), %xmm1
2616 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2617 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm10
2618 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
2619 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
2620 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2621 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2622 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2623 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm0
2624 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm15
2625 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
2626 ; AVX2-NEXT: vmovaps %xmm0, %xmm6
2627 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2628 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
2629 ; AVX2-NEXT: vmovaps 224(%rdi), %xmm0
2630 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm3
2631 ; AVX2-NEXT: vmovaps %xmm0, %xmm4
2632 ; AVX2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
2633 ; AVX2-NEXT: vmovaps 192(%rdi), %xmm12
2634 ; AVX2-NEXT: vbroadcastss %xmm12, %xmm11
2635 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
2636 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
2637 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2638 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm0
2639 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm2
2640 ; AVX2-NEXT: vmovaps %xmm0, %xmm5
2641 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2642 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm13
2643 ; AVX2-NEXT: vbroadcastss %xmm13, %xmm3
2644 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2645 ; AVX2-NEXT: vmovaps (%rdi), %xmm11
2646 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm10
2647 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
2648 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
2649 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2650 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2651 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
2652 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
2653 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
2654 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2655 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
2656 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2657 ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1]
2658 ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3]
2659 ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
2660 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2661 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2662 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2663 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1]
2664 ; AVX2-NEXT: vmovaps %xmm8, %xmm6
2665 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
2666 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2667 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2668 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
2669 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2670 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2671 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2672 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2673 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2674 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2675 ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1]
2676 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2677 ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
2678 ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
2679 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2680 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2681 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2682 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
2683 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
2684 ; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2]
2685 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
2686 ; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
2687 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6
2688 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
2689 ; AVX2-NEXT: vmovaps %xmm8, %xmm7
2690 ; AVX2-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2]
2691 ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
2692 ; AVX2-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3]
2693 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
2694 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2695 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
2696 ; AVX2-NEXT: # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3]
2697 ; AVX2-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload
2698 ; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2]
2699 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3]
2700 ; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
2701 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6
2702 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
2703 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
2704 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2705 ; AVX2-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2]
2706 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3]
2707 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
2708 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
2709 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2710 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3]
2711 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
2712 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2713 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
2714 ; AVX2-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
2715 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm5
2716 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2717 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
2718 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
2719 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2720 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
2721 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm6
2722 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2723 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2724 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2725 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm15
2726 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
2727 ; AVX2-NEXT: vmovaps (%rdi), %ymm14
2728 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2729 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
2730 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2731 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2732 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm3
2733 ; AVX2-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill
2734 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3]
2735 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
2736 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2737 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2738 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
2739 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2740 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5]
2741 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2]
2742 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2743 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm1
2744 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2745 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm7
2746 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm13
2747 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5]
2748 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
2749 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2]
2750 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2751 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2752 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm9
2753 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm8
2754 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
2755 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
2756 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm1
2757 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2758 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm2
2759 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2760 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
2761 ; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2]
2762 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3]
2763 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm0
2764 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2765 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm12
2766 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm6
2767 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm10
2768 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
2769 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
2770 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
2771 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2772 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2773 ; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm0
2774 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
2775 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
2776 ; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm2
2777 ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5]
2778 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
2779 ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5
2780 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
2781 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2782 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2783 ; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm0
2784 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
2785 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
2786 ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm1
2787 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5]
2788 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
2789 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
2790 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2791 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2792 ; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm0
2793 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
2794 ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
2795 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
2796 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7]
2797 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2798 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
2799 ; AVX2-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
2800 ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm1
2801 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2]
2802 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
2803 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2804 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2805 ; AVX2-NEXT: vbroadcastss 504(%rdi), %ymm0
2806 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
2807 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7]
2808 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
2809 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
2810 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
2811 ; AVX2-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7]
2812 ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm8
2813 ; AVX2-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2]
2814 ; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
2815 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2816 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
2817 ; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm8
2818 ; AVX2-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
2819 ; AVX2-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7]
2820 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
2821 ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5
2822 ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
2823 ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4
2824 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
2825 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
2826 ; AVX2-NEXT: vbroadcastss 476(%rdi), %ymm4
2827 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
2828 ; AVX2-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
2829 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
2830 ; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm4
2831 ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
2832 ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5
2833 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
2834 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
2835 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2836 ; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
2837 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2838 ; AVX2-NEXT: vmovaps %ymm4, (%rsi)
2839 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2840 ; AVX2-NEXT: vmovaps %ymm4, 32(%rdx)
2841 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2842 ; AVX2-NEXT: vmovaps %ymm4, (%rdx)
2843 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2844 ; AVX2-NEXT: vmovaps %ymm4, 32(%rcx)
2845 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2846 ; AVX2-NEXT: vmovaps %ymm4, (%rcx)
2847 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2848 ; AVX2-NEXT: vmovaps %ymm4, 32(%r8)
2849 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2850 ; AVX2-NEXT: vmovaps %ymm4, (%r8)
2851 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2852 ; AVX2-NEXT: vmovaps %ymm4, 32(%r9)
2853 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2854 ; AVX2-NEXT: vmovaps %ymm4, (%r9)
2855 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2856 ; AVX2-NEXT: vmovaps %ymm11, 32(%rax)
2857 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
2858 ; AVX2-NEXT: vmovaps %ymm4, (%rax)
2859 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2860 ; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
2861 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
2862 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2863 ; AVX2-NEXT: vmovaps %ymm3, 32(%rax)
2864 ; AVX2-NEXT: vmovaps %ymm2, (%rax)
2865 ; AVX2-NEXT: addq $456, %rsp # imm = 0x1C8
2866 ; AVX2-NEXT: vzeroupper
2867 ; AVX2-NEXT: retq
2869 ; AVX2-FP-LABEL: load_i32_stride8_vf16:
2870 ; AVX2-FP: # %bb.0:
2871 ; AVX2-FP-NEXT: subq $456, %rsp # imm = 0x1C8
2872 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm1
2873 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm9
2874 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
2875 ; AVX2-FP-NEXT: vmovaps %xmm1, %xmm8
2876 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm1
2877 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2878 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm2
2879 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm1
2880 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2881 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm3
2882 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2883 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2884 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1
2885 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2886 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm2
2887 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2888 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2889 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
2890 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm1
2891 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2892 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm3
2893 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm1
2894 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2895 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm10
2896 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
2897 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
2898 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2899 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2900 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2901 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm0
2902 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm15
2903 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
2904 ; AVX2-FP-NEXT: vmovaps %xmm0, %xmm6
2905 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2906 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
2907 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm0
2908 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm3
2909 ; AVX2-FP-NEXT: vmovaps %xmm0, %xmm4
2910 ; AVX2-FP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
2911 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm12
2912 ; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm11
2913 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
2914 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
2915 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2916 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm0
2917 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm2
2918 ; AVX2-FP-NEXT: vmovaps %xmm0, %xmm5
2919 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2920 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm13
2921 ; AVX2-FP-NEXT: vbroadcastss %xmm13, %xmm3
2922 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2923 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm11
2924 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm10
2925 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
2926 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
2927 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2928 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2929 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
2930 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
2931 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
2932 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2933 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
2934 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2935 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1]
2936 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3]
2937 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
2938 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2939 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2940 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2941 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1]
2942 ; AVX2-FP-NEXT: vmovaps %xmm8, %xmm6
2943 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
2944 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
2945 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2946 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
2947 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2948 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2949 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2950 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2951 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2952 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2953 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1]
2954 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2955 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
2956 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
2957 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2958 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2959 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2960 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
2961 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
2962 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2]
2963 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
2964 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
2965 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6
2966 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
2967 ; AVX2-FP-NEXT: vmovaps %xmm8, %xmm7
2968 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2]
2969 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
2970 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3]
2971 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
2972 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2973 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
2974 ; AVX2-FP-NEXT: # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3]
2975 ; AVX2-FP-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload
2976 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2]
2977 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3]
2978 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
2979 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6
2980 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
2981 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
2982 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
2983 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2]
2984 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3]
2985 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
2986 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
2987 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2988 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3]
2989 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
2990 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2991 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
2992 ; AVX2-FP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
2993 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm5
2994 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2995 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
2996 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
2997 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2998 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
2999 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm6
3000 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3001 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3002 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3003 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm15
3004 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
3005 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm14
3006 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
3007 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
3008 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3009 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
3010 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm3
3011 ; AVX2-FP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill
3012 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3]
3013 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
3014 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3015 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3016 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
3017 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
3018 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5]
3019 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2]
3020 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
3021 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm1
3022 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3023 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm7
3024 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm13
3025 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5]
3026 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
3027 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2]
3028 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3029 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3030 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm9
3031 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm8
3032 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
3033 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
3034 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm1
3035 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3036 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2
3037 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3038 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
3039 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2]
3040 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3]
3041 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm0
3042 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3043 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm12
3044 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm6
3045 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm10
3046 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
3047 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
3048 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
3049 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3050 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3051 ; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm0
3052 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
3053 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
3054 ; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm2
3055 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5]
3056 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
3057 ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5
3058 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
3059 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3060 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3061 ; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm0
3062 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
3063 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
3064 ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm1
3065 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5]
3066 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
3067 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
3068 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
3069 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3070 ; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm0
3071 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3072 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
3073 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
3074 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7]
3075 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3076 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
3077 ; AVX2-FP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
3078 ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm1
3079 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2]
3080 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
3081 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
3082 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3083 ; AVX2-FP-NEXT: vbroadcastss 504(%rdi), %ymm0
3084 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
3085 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7]
3086 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
3087 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3088 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
3089 ; AVX2-FP-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7]
3090 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm8
3091 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2]
3092 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
3093 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
3094 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
3095 ; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm8
3096 ; AVX2-FP-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
3097 ; AVX2-FP-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7]
3098 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
3099 ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5
3100 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
3101 ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4
3102 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
3103 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
3104 ; AVX2-FP-NEXT: vbroadcastss 476(%rdi), %ymm4
3105 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
3106 ; AVX2-FP-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
3107 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
3108 ; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm4
3109 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
3110 ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5
3111 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
3112 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3113 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3114 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
3115 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3116 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
3117 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3118 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx)
3119 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3120 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx)
3121 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3122 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx)
3123 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3124 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx)
3125 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3126 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8)
3127 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3128 ; AVX2-FP-NEXT: vmovaps %ymm4, (%r8)
3129 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3130 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r9)
3131 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3132 ; AVX2-FP-NEXT: vmovaps %ymm4, (%r9)
3133 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3134 ; AVX2-FP-NEXT: vmovaps %ymm11, 32(%rax)
3135 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3136 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rax)
3137 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3138 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
3139 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
3140 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3141 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax)
3142 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rax)
3143 ; AVX2-FP-NEXT: addq $456, %rsp # imm = 0x1C8
3144 ; AVX2-FP-NEXT: vzeroupper
3145 ; AVX2-FP-NEXT: retq
3147 ; AVX2-FCP-LABEL: load_i32_stride8_vf16:
3148 ; AVX2-FCP: # %bb.0:
3149 ; AVX2-FCP-NEXT: subq $456, %rsp # imm = 0x1C8
3150 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm1
3151 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm9
3152 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm9[0],xmm1[0],xmm9[1],xmm1[1]
3153 ; AVX2-FCP-NEXT: vmovaps %xmm1, %xmm8
3154 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm1
3155 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3156 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm2
3157 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm1
3158 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3159 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm3
3160 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
3161 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
3162 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm1
3163 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3164 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm2
3165 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3166 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3167 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
3168 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm1
3169 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3170 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm3
3171 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm1
3172 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3173 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm10
3174 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm10[0],xmm3[0],xmm10[1],xmm3[1]
3175 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
3176 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
3177 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3178 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3179 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm0
3180 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm15
3181 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3182 ; AVX2-FCP-NEXT: vmovaps %xmm0, %xmm6
3183 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3184 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
3185 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm0
3186 ; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm3
3187 ; AVX2-FCP-NEXT: vmovaps %xmm0, %xmm4
3188 ; AVX2-FCP-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
3189 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm12
3190 ; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm11
3191 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
3192 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
3193 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm3[6,7]
3194 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm0
3195 ; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm2
3196 ; AVX2-FCP-NEXT: vmovaps %xmm0, %xmm5
3197 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3198 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm13
3199 ; AVX2-FCP-NEXT: vbroadcastss %xmm13, %xmm3
3200 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
3201 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm11
3202 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm10
3203 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
3204 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
3205 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3206 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3207 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
3208 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
3209 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
3210 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
3211 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
3212 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3213 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm15[1,1,1,1]
3214 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm6[1],xmm7[2,3]
3215 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
3216 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
3217 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3218 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3219 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[1,1,1,1]
3220 ; AVX2-FCP-NEXT: vmovaps %xmm8, %xmm6
3221 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm8[1],xmm0[2,3]
3222 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3223 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3224 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm8[0],xmm14[1],xmm8[1]
3225 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
3226 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3227 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3228 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
3229 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3230 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3231 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm5[1,1,1,1]
3232 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3233 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
3234 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
3235 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
3236 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3237 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3238 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
3239 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
3240 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm2[2,2,2,2]
3241 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm3[0,1,2],xmm5[3]
3242 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
3243 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm6
3244 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
3245 ; AVX2-FCP-NEXT: vmovaps %xmm8, %xmm7
3246 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm6 = xmm8[2,2,2,2]
3247 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm14[0,1,2],xmm6[3]
3248 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm6 = xmm0[0,1],xmm6[2,3]
3249 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3250 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3251 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm4 # 16-byte Folded Reload
3252 ; AVX2-FCP-NEXT: # xmm4 = xmm15[2],mem[2],xmm15[3],mem[3]
3253 ; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm9 # 16-byte Reload
3254 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm9[2,2,2,2]
3255 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm12[0,1,2],xmm5[3]
3256 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
3257 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm6
3258 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
3259 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
3260 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3261 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm3 = xmm8[2,2,2,2]
3262 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm13[0,1,2],xmm3[3]
3263 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3]
3264 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
3265 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3266 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm7[2],xmm14[3],xmm7[3]
3267 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
3268 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3269 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
3270 ; AVX2-FCP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
3271 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm5
3272 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3273 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
3274 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm1[2,3,2,3]
3275 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3276 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
3277 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm6
3278 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3279 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3280 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3281 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm15
3282 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
3283 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm14
3284 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
3285 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
3286 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3287 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
3288 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm3
3289 ; AVX2-FCP-NEXT: vmovups %ymm3, (%rsp) # 32-byte Spill
3290 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm8[2],xmm13[3],xmm8[3]
3291 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm2[1],xmm1[1]
3292 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3293 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3294 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[4],ymm15[4],ymm14[5],ymm15[5]
3295 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
3296 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[4],ymm5[4],ymm6[5],ymm5[5]
3297 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm11[2,2,2,2]
3298 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
3299 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm1
3300 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3301 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm7
3302 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm13
3303 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm13[0],ymm7[0],ymm13[1],ymm7[1],ymm13[4],ymm7[4],ymm13[5],ymm7[5]
3304 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
3305 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm5[0],ymm2[2],ymm5[2]
3306 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3307 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3308 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm9
3309 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm8
3310 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm8[0],ymm9[0],ymm8[1],ymm9[1],ymm8[4],ymm9[4],ymm8[5],ymm9[5]
3311 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
3312 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm1
3313 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3314 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2
3315 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3316 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
3317 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[2,2,2,2]
3318 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm0[0,1],xmm6[2,3]
3319 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm0
3320 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3321 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm12
3322 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm6
3323 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm10
3324 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
3325 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm3 = ymm12[0],ymm0[0],ymm12[1],ymm0[1],ymm12[4],ymm0[4],ymm12[5],ymm0[5]
3326 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[2],ymm3[2]
3327 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3328 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3329 ; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm0
3330 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
3331 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
3332 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm2
3333 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm14[1,1,1,1,5,5,5,5]
3334 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm15[1],ymm5[2,3,4],ymm15[5],ymm5[6,7]
3335 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
3336 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
3337 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3338 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3339 ; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm0
3340 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
3341 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
3342 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm1
3343 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm8[1,1,1,1,5,5,5,5]
3344 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm9[1],ymm2[2,3,4],ymm9[5],ymm2[6,7]
3345 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
3346 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
3347 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3348 ; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm0
3349 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
3350 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
3351 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
3352 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[6],ymm15[6],ymm14[7],ymm15[7]
3353 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
3354 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
3355 ; AVX2-FCP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
3356 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm1
3357 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm5[2,2,2,2]
3358 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm7[2,3]
3359 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
3360 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3361 ; AVX2-FCP-NEXT: vbroadcastss 504(%rdi), %ymm0
3362 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6],ymm0[7]
3363 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7]
3364 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm6 = ymm8[2],ymm9[2],ymm8[3],ymm9[3],ymm8[6],ymm9[6],ymm8[7],ymm9[7]
3365 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
3366 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm7, %ymm7 # 32-byte Folded Reload
3367 ; AVX2-FCP-NEXT: # ymm7 = ymm7[2],mem[2],ymm7[3],mem[3],ymm7[6],mem[6],ymm7[7],mem[7]
3368 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm8
3369 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm9 = ymm7[2,2,2,2]
3370 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
3371 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
3372 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
3373 ; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm8
3374 ; AVX2-FCP-NEXT: vunpckhps (%rsp), %ymm8, %ymm8 # 32-byte Folded Reload
3375 ; AVX2-FCP-NEXT: # ymm8 = ymm8[2],mem[2],ymm8[3],mem[3],ymm8[6],mem[6],ymm8[7],mem[7]
3376 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm2 = ymm2[1],ymm8[1],ymm2[3],ymm8[3]
3377 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
3378 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,3,2,3,6,7,6,7]
3379 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4
3380 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
3381 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7]
3382 ; AVX2-FCP-NEXT: vbroadcastss 476(%rdi), %ymm4
3383 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
3384 ; AVX2-FCP-NEXT: # ymm4 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
3385 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm3 = ymm3[1],ymm4[1],ymm3[3],ymm4[3]
3386 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm4
3387 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm6[2,3,2,3,6,7,6,7]
3388 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
3389 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
3390 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
3391 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3392 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi)
3393 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3394 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
3395 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3396 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx)
3397 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3398 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx)
3399 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3400 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx)
3401 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3402 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx)
3403 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3404 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r8)
3405 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3406 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8)
3407 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3408 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r9)
3409 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3410 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r9)
3411 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3412 ; AVX2-FCP-NEXT: vmovaps %ymm11, 32(%rax)
3413 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
3414 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax)
3415 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3416 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
3417 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
3418 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3419 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax)
3420 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax)
3421 ; AVX2-FCP-NEXT: addq $456, %rsp # imm = 0x1C8
3422 ; AVX2-FCP-NEXT: vzeroupper
3423 ; AVX2-FCP-NEXT: retq
3425 ; AVX512-LABEL: load_i32_stride8_vf16:
3426 ; AVX512: # %bb.0:
3427 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
3428 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
3429 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
3430 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
3431 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2
3432 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1
3433 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm4
3434 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm5
3435 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3
3436 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7
3437 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6
3438 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
3439 ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3440 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm9
3441 ; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm9
3442 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
3443 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm10
3444 ; AVX512-NEXT: movb $-64, %dil
3445 ; AVX512-NEXT: kmovw %edi, %k1
3446 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
3447 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9
3448 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm9
3449 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm8
3450 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3451 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
3452 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
3453 ; AVX512-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3454 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10
3455 ; AVX512-NEXT: vpermt2d %zmm7, %zmm9, %zmm10
3456 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm11
3457 ; AVX512-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
3458 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
3459 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10
3460 ; AVX512-NEXT: vpermt2d %zmm4, %zmm9, %zmm10
3461 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
3462 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
3463 ; AVX512-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
3464 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
3465 ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3466 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm11
3467 ; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm11
3468 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12
3469 ; AVX512-NEXT: vpermt2d %zmm5, %zmm10, %zmm12
3470 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
3471 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm11
3472 ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm11
3473 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm10
3474 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3475 ; AVX512-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
3476 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
3477 ; AVX512-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3478 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12
3479 ; AVX512-NEXT: vpermt2d %zmm7, %zmm11, %zmm12
3480 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13
3481 ; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm13
3482 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
3483 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm12
3484 ; AVX512-NEXT: vpermt2d %zmm4, %zmm11, %zmm12
3485 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm11
3486 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
3487 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
3488 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
3489 ; AVX512-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3490 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13
3491 ; AVX512-NEXT: vpermt2d %zmm7, %zmm12, %zmm13
3492 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm14
3493 ; AVX512-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
3494 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
3495 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13
3496 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3497 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm12
3498 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3499 ; AVX512-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
3500 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
3501 ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3502 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14
3503 ; AVX512-NEXT: vpermt2d %zmm7, %zmm13, %zmm14
3504 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm15
3505 ; AVX512-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
3506 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
3507 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm14
3508 ; AVX512-NEXT: vpermt2d %zmm4, %zmm13, %zmm14
3509 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm13
3510 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
3511 ; AVX512-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
3512 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
3513 ; AVX512-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3514 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15
3515 ; AVX512-NEXT: vpermt2d %zmm7, %zmm14, %zmm15
3516 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm16
3517 ; AVX512-NEXT: vpermt2d %zmm5, %zmm14, %zmm16
3518 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
3519 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm15
3520 ; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm15
3521 ; AVX512-NEXT: vpermi2d %zmm2, %zmm0, %zmm14
3522 ; AVX512-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
3523 ; AVX512-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
3524 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
3525 ; AVX512-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3526 ; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm6
3527 ; AVX512-NEXT: vpermt2d %zmm5, %zmm15, %zmm3
3528 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
3529 ; AVX512-NEXT: vpermt2d %zmm4, %zmm15, %zmm1
3530 ; AVX512-NEXT: vpermt2d %zmm2, %zmm15, %zmm0
3531 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3532 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
3533 ; AVX512-NEXT: vmovdqa64 %zmm8, (%rsi)
3534 ; AVX512-NEXT: vmovdqa64 %zmm9, (%rdx)
3535 ; AVX512-NEXT: vmovdqa64 %zmm10, (%rcx)
3536 ; AVX512-NEXT: vmovdqa64 %zmm11, (%r8)
3537 ; AVX512-NEXT: vmovdqa64 %zmm12, (%r9)
3538 ; AVX512-NEXT: vmovdqa64 %zmm13, (%r11)
3539 ; AVX512-NEXT: vmovdqa64 %zmm14, (%r10)
3540 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
3541 ; AVX512-NEXT: vzeroupper
3542 ; AVX512-NEXT: retq
3544 ; AVX512-FCP-LABEL: load_i32_stride8_vf16:
3545 ; AVX512-FCP: # %bb.0:
3546 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3547 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3548 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
3549 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3550 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
3551 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
3552 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
3553 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5
3554 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
3555 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
3556 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
3557 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
3558 ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3559 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
3560 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9
3561 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
3562 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10
3563 ; AVX512-FCP-NEXT: movb $-64, %dil
3564 ; AVX512-FCP-NEXT: kmovw %edi, %k1
3565 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
3566 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
3567 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9
3568 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8
3569 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3570 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
3571 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
3572 ; AVX512-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3573 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
3574 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10
3575 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm11
3576 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
3577 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
3578 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
3579 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10
3580 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
3581 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
3582 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
3583 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
3584 ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3585 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
3586 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11
3587 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
3588 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12
3589 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
3590 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
3591 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11
3592 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10
3593 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3594 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
3595 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
3596 ; AVX512-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3597 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
3598 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12
3599 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
3600 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13
3601 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
3602 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
3603 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12
3604 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11
3605 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
3606 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
3607 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
3608 ; AVX512-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3609 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
3610 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13
3611 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm14
3612 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
3613 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
3614 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
3615 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3616 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12
3617 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3618 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
3619 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
3620 ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3621 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
3622 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14
3623 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
3624 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
3625 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
3626 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
3627 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14
3628 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13
3629 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
3630 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
3631 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
3632 ; AVX512-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3633 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
3634 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15
3635 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm16
3636 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16
3637 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
3638 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
3639 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15
3640 ; AVX512-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14
3641 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
3642 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
3643 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
3644 ; AVX512-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3645 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6
3646 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3
3647 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
3648 ; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1
3649 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0
3650 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3651 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
3652 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
3653 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
3654 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
3655 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%r8)
3656 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, (%r9)
3657 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, (%r11)
3658 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, (%r10)
3659 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
3660 ; AVX512-FCP-NEXT: vzeroupper
3661 ; AVX512-FCP-NEXT: retq
3663 ; AVX512DQ-LABEL: load_i32_stride8_vf16:
3664 ; AVX512DQ: # %bb.0:
3665 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
3666 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
3667 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
3668 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
3669 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm2
3670 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1
3671 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm4
3672 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm5
3673 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm3
3674 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7
3675 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6
3676 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
3677 ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3678 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm9
3679 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm9
3680 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10
3681 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm8, %zmm10
3682 ; AVX512DQ-NEXT: movb $-64, %dil
3683 ; AVX512DQ-NEXT: kmovw %edi, %k1
3684 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
3685 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm9
3686 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm8, %zmm9
3687 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm8
3688 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3689 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
3690 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
3691 ; AVX512DQ-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3692 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10
3693 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm9, %zmm10
3694 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm11
3695 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
3696 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
3697 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10
3698 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm9, %zmm10
3699 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
3700 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
3701 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
3702 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
3703 ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3704 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm11
3705 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm11
3706 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12
3707 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm10, %zmm12
3708 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
3709 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm11
3710 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm10, %zmm11
3711 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm10
3712 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3713 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
3714 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
3715 ; AVX512DQ-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3716 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12
3717 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm11, %zmm12
3718 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13
3719 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm11, %zmm13
3720 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
3721 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm12
3722 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm11, %zmm12
3723 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm11
3724 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
3725 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
3726 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
3727 ; AVX512DQ-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3728 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13
3729 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm12, %zmm13
3730 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm14
3731 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
3732 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
3733 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm13
3734 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3735 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm12
3736 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3737 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
3738 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
3739 ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3740 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm14
3741 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm13, %zmm14
3742 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm15
3743 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
3744 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
3745 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm14
3746 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm13, %zmm14
3747 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm13
3748 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
3749 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
3750 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
3751 ; AVX512DQ-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3752 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15
3753 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm14, %zmm15
3754 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm16
3755 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm14, %zmm16
3756 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
3757 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm15
3758 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm14, %zmm15
3759 ; AVX512DQ-NEXT: vpermi2d %zmm2, %zmm0, %zmm14
3760 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
3761 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
3762 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
3763 ; AVX512DQ-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3764 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm15, %zmm6
3765 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm15, %zmm3
3766 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
3767 ; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm15, %zmm1
3768 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm15, %zmm0
3769 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3770 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
3771 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, (%rsi)
3772 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, (%rdx)
3773 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, (%rcx)
3774 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, (%r8)
3775 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%r9)
3776 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, (%r11)
3777 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, (%r10)
3778 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
3779 ; AVX512DQ-NEXT: vzeroupper
3780 ; AVX512DQ-NEXT: retq
3782 ; AVX512DQ-FCP-LABEL: load_i32_stride8_vf16:
3783 ; AVX512DQ-FCP: # %bb.0:
3784 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3785 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3786 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
3787 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3788 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
3789 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
3790 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
3791 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5
3792 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
3793 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
3794 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
3795 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
3796 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3797 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
3798 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9
3799 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
3800 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10
3801 ; AVX512DQ-FCP-NEXT: movb $-64, %dil
3802 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1
3803 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
3804 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
3805 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9
3806 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8
3807 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3808 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
3809 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
3810 ; AVX512DQ-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3811 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
3812 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10
3813 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm11
3814 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
3815 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
3816 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
3817 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10
3818 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
3819 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
3820 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
3821 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
3822 ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3823 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
3824 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11
3825 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
3826 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12
3827 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
3828 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
3829 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11
3830 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10
3831 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3832 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
3833 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
3834 ; AVX512DQ-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3835 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
3836 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12
3837 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
3838 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13
3839 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
3840 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
3841 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12
3842 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11
3843 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
3844 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
3845 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
3846 ; AVX512DQ-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3847 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
3848 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13
3849 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm14
3850 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
3851 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
3852 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
3853 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3854 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12
3855 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3856 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
3857 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
3858 ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3859 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
3860 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14
3861 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
3862 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
3863 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
3864 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
3865 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14
3866 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13
3867 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
3868 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
3869 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
3870 ; AVX512DQ-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3871 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
3872 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15
3873 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm16
3874 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16
3875 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
3876 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
3877 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15
3878 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14
3879 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
3880 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
3881 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
3882 ; AVX512DQ-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3883 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6
3884 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3
3885 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
3886 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1
3887 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0
3888 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3889 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
3890 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
3891 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
3892 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
3893 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%r8)
3894 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, (%r9)
3895 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, (%r11)
3896 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, (%r10)
3897 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
3898 ; AVX512DQ-FCP-NEXT: vzeroupper
3899 ; AVX512DQ-FCP-NEXT: retq
3900 ;
3901 ; AVX512BW-LABEL: load_i32_stride8_vf16:
3902 ; AVX512BW: # %bb.0:
3903 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3904 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3905 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
3906 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3907 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
3908 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
3909 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4
3910 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5
3911 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3
3912 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7
3913 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6
3914 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
3915 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3916 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9
3917 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9
3918 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
3919 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm10
3920 ; AVX512BW-NEXT: movb $-64, %dil
3921 ; AVX512BW-NEXT: kmovd %edi, %k1
3922 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
3923 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9
3924 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9
3925 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8
3926 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
3927 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
3928 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
3929 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3930 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10
3931 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10
3932 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11
3933 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
3934 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
3935 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
3936 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm10
3937 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
3938 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
3939 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
3940 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
3941 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3942 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11
3943 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11
3944 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12
3945 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12
3946 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
3947 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11
3948 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm11
3949 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10
3950 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
3951 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
3952 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
3953 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3954 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12
3955 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12
3956 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13
3957 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm13
3958 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
3959 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12
3960 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm12
3961 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm11
3962 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
3963 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
3964 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
3965 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3966 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13
3967 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13
3968 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14
3969 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
3970 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
3971 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13
3972 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
3973 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm12
3974 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3975 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
3976 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
3977 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3978 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14
3979 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm14
3980 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15
3981 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
3982 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
3983 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14
3984 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm14
3985 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm13
3986 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
3987 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
3988 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
3989 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
3990 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15
3991 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm15
3992 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16
3993 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm16
3994 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
3995 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15
3996 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm15
3997 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14
3998 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
3999 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
4000 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
4001 ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4002 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6
4003 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3
4004 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
4005 ; AVX512BW-NEXT: vpermt2d %zmm4, %zmm15, %zmm1
4006 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm0
4007 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4008 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
4009 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi)
4010 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx)
4011 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx)
4012 ; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8)
4013 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r9)
4014 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r11)
4015 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10)
4016 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
4017 ; AVX512BW-NEXT: vzeroupper
4018 ; AVX512BW-NEXT: retq
4019 ;
4020 ; AVX512BW-FCP-LABEL: load_i32_stride8_vf16:
4021 ; AVX512BW-FCP: # %bb.0:
4022 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4023 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4024 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
4025 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
4026 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
4027 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
4028 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
4029 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5
4030 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
4031 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
4032 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
4033 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
4034 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4035 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
4036 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9
4037 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
4038 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10
4039 ; AVX512BW-FCP-NEXT: movb $-64, %dil
4040 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
4041 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
4042 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
4043 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9
4044 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8
4045 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
4046 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
4047 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
4048 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4049 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
4050 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10
4051 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11
4052 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
4053 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
4054 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
4055 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10
4056 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
4057 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
4058 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
4059 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
4060 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4061 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
4062 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11
4063 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
4064 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12
4065 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
4066 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
4067 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11
4068 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10
4069 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
4070 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
4071 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
4072 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4073 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
4074 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12
4075 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
4076 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13
4077 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
4078 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
4079 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12
4080 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11
4081 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
4082 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
4083 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
4084 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4085 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
4086 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13
4087 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14
4088 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
4089 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
4090 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
4091 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
4092 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12
4093 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
4094 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
4095 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
4096 ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4097 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
4098 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14
4099 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
4100 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
4101 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
4102 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
4103 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14
4104 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13
4105 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
4106 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
4107 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
4108 ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4109 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
4110 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15
4111 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16
4112 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16
4113 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
4114 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
4115 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15
4116 ; AVX512BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14
4117 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
4118 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
4119 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
4120 ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4121 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6
4122 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3
4123 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
4124 ; AVX512BW-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1
4125 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0
4126 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4127 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
4128 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
4129 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
4130 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
4131 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8)
4132 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9)
4133 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11)
4134 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10)
4135 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
4136 ; AVX512BW-FCP-NEXT: vzeroupper
4137 ; AVX512BW-FCP-NEXT: retq
4138 ;
4139 ; AVX512DQ-BW-LABEL: load_i32_stride8_vf16:
4140 ; AVX512DQ-BW: # %bb.0:
4141 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
4142 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
4143 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
4144 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
4145 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
4146 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
4147 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4
4148 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm5
4149 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3
4150 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7
4151 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6
4152 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
4153 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4154 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9
4155 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9
4156 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
4157 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm8, %zmm10
4158 ; AVX512DQ-BW-NEXT: movb $-64, %dil
4159 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
4160 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
4161 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9
4162 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm8, %zmm9
4163 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm8
4164 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
4165 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
4166 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
4167 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4168 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10
4169 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm9, %zmm10
4170 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11
4171 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
4172 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
4173 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10
4174 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm9, %zmm10
4175 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
4176 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
4177 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
4178 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
4179 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4180 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11
4181 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm11
4182 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12
4183 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm10, %zmm12
4184 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
4185 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11
4186 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm10, %zmm11
4187 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm10
4188 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
4189 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
4190 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
4191 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4192 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12
4193 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm11, %zmm12
4194 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13
4195 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm11, %zmm13
4196 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
4197 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12
4198 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm11, %zmm12
4199 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm11
4200 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
4201 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
4202 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
4203 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4204 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13
4205 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm12, %zmm13
4206 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14
4207 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
4208 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
4209 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13
4210 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
4211 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm12
4212 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
4213 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
4214 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
4215 ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4216 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14
4217 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm14
4218 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15
4219 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
4220 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
4221 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14
4222 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm13, %zmm14
4223 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm13
4224 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
4225 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
4226 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
4227 ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4228 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15
4229 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm14, %zmm15
4230 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm16
4231 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm14, %zmm16
4232 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
4233 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15
4234 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm14, %zmm15
4235 ; AVX512DQ-BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14
4236 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
4237 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
4238 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
4239 ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4240 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm15, %zmm6
4241 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm15, %zmm3
4242 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
4243 ; AVX512DQ-BW-NEXT: vpermt2d %zmm4, %zmm15, %zmm1
4244 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm0
4245 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4246 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
4247 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi)
4248 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdx)
4249 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx)
4250 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r8)
4251 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r9)
4252 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r11)
4253 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%r10)
4254 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
4255 ; AVX512DQ-BW-NEXT: vzeroupper
4256 ; AVX512DQ-BW-NEXT: retq
4257 ;
4258 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf16:
4259 ; AVX512DQ-BW-FCP: # %bb.0:
4260 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
4261 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
4262 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
4263 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
4264 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
4265 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
4266 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
4267 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5
4268 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
4269 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
4270 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
4271 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
4272 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4273 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
4274 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm9
4275 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
4276 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm8, %zmm10
4277 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil
4278 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
4279 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
4280 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
4281 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm8, %zmm9
4282 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm8
4283 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
4284 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
4285 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
4286 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4287 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
4288 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm9, %zmm10
4289 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11
4290 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm9, %zmm11
4291 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
4292 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
4293 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm9, %zmm10
4294 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm9
4295 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
4296 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
4297 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
4298 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4299 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
4300 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm11
4301 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
4302 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm10, %zmm12
4303 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
4304 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
4305 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm10, %zmm11
4306 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm10
4307 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
4308 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
4309 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
4310 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4311 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
4312 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm11, %zmm12
4313 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
4314 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm11, %zmm13
4315 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
4316 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
4317 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm11, %zmm12
4318 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm11
4319 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
4320 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
4321 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
4322 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4323 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
4324 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm12, %zmm13
4325 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14
4326 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm12, %zmm14
4327 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
4328 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
4329 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm12, %zmm13
4330 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm12
4331 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
4332 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
4333 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
4334 ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4335 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
4336 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm14
4337 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
4338 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm13, %zmm15
4339 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
4340 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
4341 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm13, %zmm14
4342 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm13
4343 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
4344 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
4345 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
4346 ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4347 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
4348 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm14, %zmm15
4349 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16
4350 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm14, %zmm16
4351 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
4352 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
4353 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm14, %zmm15
4354 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm2, %zmm0, %zmm14
4355 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
4356 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
4357 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
4358 ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
4359 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm15, %zmm6
4360 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm15, %zmm3
4361 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
4362 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm4, %zmm15, %zmm1
4363 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm15, %zmm0
4364 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4365 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
4366 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
4367 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
4368 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
4369 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8)
4370 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9)
4371 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11)
4372 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10)
4373 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
4374 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
4375 ; AVX512DQ-BW-FCP-NEXT: retq
4376 %wide.vec = load <128 x i32>, ptr %in.vec, align 64
4377 %strided.vec0 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
4378 %strided.vec1 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121>
4379 %strided.vec2 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122>
4380 %strided.vec3 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123>
4381 %strided.vec4 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124>
4382 %strided.vec5 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
4383 %strided.vec6 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
4384 %strided.vec7 = shufflevector <128 x i32> %wide.vec, <128 x i32> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
4385 store <16 x i32> %strided.vec0, ptr %out.vec0, align 64
4386 store <16 x i32> %strided.vec1, ptr %out.vec1, align 64
4387 store <16 x i32> %strided.vec2, ptr %out.vec2, align 64
4388 store <16 x i32> %strided.vec3, ptr %out.vec3, align 64
4389 store <16 x i32> %strided.vec4, ptr %out.vec4, align 64
4390 store <16 x i32> %strided.vec5, ptr %out.vec5, align 64
4391 store <16 x i32> %strided.vec6, ptr %out.vec6, align 64
4392 store <16 x i32> %strided.vec7, ptr %out.vec7, align 64
4393   ret void
4394 }
4396 define void @load_i32_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
4397 ; SSE-LABEL: load_i32_stride8_vf32:
4398 ; SSE: # %bb.0:
4399 ; SSE-NEXT: subq $952, %rsp # imm = 0x3B8
4400 ; SSE-NEXT: movaps 544(%rdi), %xmm5
4401 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4402 ; SSE-NEXT: movaps 608(%rdi), %xmm6
4403 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4404 ; SSE-NEXT: movaps 576(%rdi), %xmm7
4405 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4406 ; SSE-NEXT: movaps 672(%rdi), %xmm8
4407 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4408 ; SSE-NEXT: movaps 640(%rdi), %xmm4
4409 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4410 ; SSE-NEXT: movaps 736(%rdi), %xmm9
4411 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4412 ; SSE-NEXT: movaps 704(%rdi), %xmm3
4413 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4414 ; SSE-NEXT: movaps 160(%rdi), %xmm10
4415 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4416 ; SSE-NEXT: movaps 128(%rdi), %xmm1
4417 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4418 ; SSE-NEXT: movaps 224(%rdi), %xmm2
4419 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4420 ; SSE-NEXT: movaps 192(%rdi), %xmm0
4421 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4422 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4423 ; SSE-NEXT: movaps %xmm1, %xmm2
4424 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
4425 ; SSE-NEXT: movaps %xmm2, %xmm1
4426 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4427 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4428 ; SSE-NEXT: movaps %xmm3, %xmm1
4429 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
4430 ; SSE-NEXT: movaps %xmm4, %xmm3
4431 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
4432 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4433 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4434 ; SSE-NEXT: movaps %xmm3, %xmm0
4435 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
4436 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4437 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
4438 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4439 ; SSE-NEXT: movaps %xmm7, %xmm0
4440 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
4441 ; SSE-NEXT: movaps 512(%rdi), %xmm1
4442 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4443 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
4444 ; SSE-NEXT: movaps %xmm1, %xmm2
4445 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4446 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4447 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4448 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4449 ; SSE-NEXT: movaps 480(%rdi), %xmm1
4450 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4451 ; SSE-NEXT: movaps 448(%rdi), %xmm10
4452 ; SSE-NEXT: movaps %xmm10, %xmm0
4453 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4454 ; SSE-NEXT: movaps 416(%rdi), %xmm3
4455 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4456 ; SSE-NEXT: movaps 384(%rdi), %xmm2
4457 ; SSE-NEXT: movaps %xmm2, %xmm1
4458 ; SSE-NEXT: movaps %xmm2, %xmm14
4459 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
4460 ; SSE-NEXT: movaps %xmm1, %xmm2
4461 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4462 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4463 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4464 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4465 ; SSE-NEXT: movaps 992(%rdi), %xmm1
4466 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4467 ; SSE-NEXT: movaps 960(%rdi), %xmm15
4468 ; SSE-NEXT: movaps %xmm15, %xmm0
4469 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4470 ; SSE-NEXT: movaps 928(%rdi), %xmm2
4471 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4472 ; SSE-NEXT: movaps 896(%rdi), %xmm1
4473 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4474 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4475 ; SSE-NEXT: movaps %xmm1, %xmm2
4476 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4477 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4478 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4479 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4480 ; SSE-NEXT: movaps 352(%rdi), %xmm1
4481 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4482 ; SSE-NEXT: movaps 320(%rdi), %xmm12
4483 ; SSE-NEXT: movaps %xmm12, %xmm0
4484 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4485 ; SSE-NEXT: movaps 288(%rdi), %xmm3
4486 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4487 ; SSE-NEXT: movaps 256(%rdi), %xmm1
4488 ; SSE-NEXT: movaps %xmm1, %xmm2
4489 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4490 ; SSE-NEXT: movaps %xmm2, %xmm3
4491 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm0[0]
4492 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4493 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4494 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4495 ; SSE-NEXT: movaps 864(%rdi), %xmm2
4496 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4497 ; SSE-NEXT: movaps 832(%rdi), %xmm11
4498 ; SSE-NEXT: movaps %xmm11, %xmm0
4499 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4500 ; SSE-NEXT: movaps 800(%rdi), %xmm4
4501 ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill
4502 ; SSE-NEXT: movaps 768(%rdi), %xmm2
4503 ; SSE-NEXT: movaps %xmm2, %xmm3
4504 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
4505 ; SSE-NEXT: movaps %xmm3, %xmm4
4506 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0]
4507 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4508 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm0[1]
4509 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4510 ; SSE-NEXT: movaps 96(%rdi), %xmm6
4511 ; SSE-NEXT: movaps 64(%rdi), %xmm9
4512 ; SSE-NEXT: movaps %xmm9, %xmm13
4513 ; SSE-NEXT: unpcklps {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1]
4514 ; SSE-NEXT: movaps (%rdi), %xmm8
4515 ; SSE-NEXT: movaps 32(%rdi), %xmm3
4516 ; SSE-NEXT: movaps %xmm8, %xmm7
4517 ; SSE-NEXT: unpcklps {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
4518 ; SSE-NEXT: movaps %xmm7, %xmm5
4519 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm13[0]
4520 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4521 ; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm13[1]
4522 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4523 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4524 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4525 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4526 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4527 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4528 ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
4529 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4530 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
4531 ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
4532 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4533 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4534 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
4535 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4536 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4537 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4538 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
4539 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4540 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4541 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
4542 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4543 ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
4544 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4545 ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
4546 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4547 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4548 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
4549 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4550 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4551 ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
4552 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4553 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
4554 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4555 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
4556 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4557 ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
4558 ; SSE-NEXT: unpckhps (%rsp), %xmm2 # 16-byte Folded Reload
4559 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
4560 ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm6[2],xmm9[3],xmm6[3]
4561 ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm3[2],xmm8[3],xmm3[3]
4562 ; SSE-NEXT: movaps %xmm5, %xmm3
4563 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm4[0]
4564 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4565 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm4[1]
4566 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4567 ; SSE-NEXT: movaps %xmm1, %xmm3
4568 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm12[0]
4569 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4570 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
4571 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4572 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4573 ; SSE-NEXT: movaps %xmm6, %xmm1
4574 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0]
4575 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4576 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm10[1]
4577 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4578 ; SSE-NEXT: movaps %xmm0, %xmm3
4579 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm7[0]
4580 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4581 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
4582 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4583 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4584 ; SSE-NEXT: movaps %xmm1, %xmm3
4585 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm13[0]
4586 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4587 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm13[1]
4588 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4589 ; SSE-NEXT: movaps %xmm2, %xmm1
4590 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm11[0]
4591 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4592 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1]
4593 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4594 ; SSE-NEXT: movaps %xmm14, %xmm1
4595 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm15[0]
4596 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4597 ; SSE-NEXT: unpckhpd {{.*#+}} xmm14 = xmm14[1],xmm15[1]
4598 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4599 ; SSE-NEXT: movaps %xmm8, %xmm1
4600 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm9[0]
4601 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4602 ; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm9[1]
4603 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4604 ; SSE-NEXT: movaps 240(%rdi), %xmm2
4605 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4606 ; SSE-NEXT: movaps 208(%rdi), %xmm12
4607 ; SSE-NEXT: movaps %xmm12, %xmm0
4608 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4609 ; SSE-NEXT: movaps 176(%rdi), %xmm3
4610 ; SSE-NEXT: movaps 144(%rdi), %xmm1
4611 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4612 ; SSE-NEXT: movaps %xmm1, %xmm2
4613 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4614 ; SSE-NEXT: movaps %xmm3, %xmm13
4615 ; SSE-NEXT: movaps %xmm2, %xmm1
4616 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4617 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4618 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4619 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4620 ; SSE-NEXT: movaps 368(%rdi), %xmm4
4621 ; SSE-NEXT: movaps 336(%rdi), %xmm1
4622 ; SSE-NEXT: movaps %xmm1, %xmm0
4623 ; SSE-NEXT: movaps %xmm1, %xmm9
4624 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
4625 ; SSE-NEXT: movaps 304(%rdi), %xmm5
4626 ; SSE-NEXT: movaps 272(%rdi), %xmm8
4627 ; SSE-NEXT: movaps %xmm8, %xmm1
4628 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
4629 ; SSE-NEXT: movaps %xmm1, %xmm2
4630 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4631 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4632 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4633 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
4634 ; SSE-NEXT: movaps 496(%rdi), %xmm7
4635 ; SSE-NEXT: movaps 464(%rdi), %xmm0
4636 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4637 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
4638 ; SSE-NEXT: movaps 432(%rdi), %xmm2
4639 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4640 ; SSE-NEXT: movaps 400(%rdi), %xmm1
4641 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4642 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4643 ; SSE-NEXT: movaps %xmm1, %xmm2
4644 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4645 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4646 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4647 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4648 ; SSE-NEXT: movaps 624(%rdi), %xmm2
4649 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4650 ; SSE-NEXT: movaps 592(%rdi), %xmm1
4651 ; SSE-NEXT: movaps %xmm1, %xmm0
4652 ; SSE-NEXT: movaps %xmm1, %xmm3
4653 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
4654 ; SSE-NEXT: movaps 560(%rdi), %xmm6
4655 ; SSE-NEXT: movaps 528(%rdi), %xmm1
4656 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4657 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
4658 ; SSE-NEXT: movaps %xmm1, %xmm2
4659 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4660 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4661 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4662 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4663 ; SSE-NEXT: movaps 752(%rdi), %xmm1
4664 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4665 ; SSE-NEXT: movaps 720(%rdi), %xmm0
4666 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4667 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4668 ; SSE-NEXT: movaps 688(%rdi), %xmm2
4669 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4670 ; SSE-NEXT: movaps 656(%rdi), %xmm1
4671 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4672 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
4673 ; SSE-NEXT: movaps %xmm1, %xmm2
4674 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
4675 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4676 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
4677 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4678 ; SSE-NEXT: movaps 880(%rdi), %xmm1
4679 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4680 ; SSE-NEXT: movaps 848(%rdi), %xmm14
4681 ; SSE-NEXT: movaps %xmm14, %xmm0
4682 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4683 ; SSE-NEXT: movaps 816(%rdi), %xmm1
4684 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4685 ; SSE-NEXT: movaps 784(%rdi), %xmm2
4686 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4687 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4688 ; SSE-NEXT: movaps %xmm2, %xmm1
4689 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4690 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4691 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4692 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4693 ; SSE-NEXT: movaps 1008(%rdi), %xmm1
4694 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4695 ; SSE-NEXT: movaps 976(%rdi), %xmm11
4696 ; SSE-NEXT: movaps %xmm11, %xmm0
4697 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4698 ; SSE-NEXT: movaps 944(%rdi), %xmm1
4699 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4700 ; SSE-NEXT: movaps 912(%rdi), %xmm2
4701 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4702 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4703 ; SSE-NEXT: movaps %xmm2, %xmm1
4704 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4705 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4706 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4707 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4708 ; SSE-NEXT: movaps 112(%rdi), %xmm1
4709 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4710 ; SSE-NEXT: movaps 80(%rdi), %xmm10
4711 ; SSE-NEXT: movaps %xmm10, %xmm0
4712 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
4713 ; SSE-NEXT: movaps 16(%rdi), %xmm15
4714 ; SSE-NEXT: movaps 48(%rdi), %xmm1
4715 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4716 ; SSE-NEXT: movaps %xmm15, %xmm2
4717 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4718 ; SSE-NEXT: movaps %xmm2, %xmm1
4719 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
4720 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4721 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
4722 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4723 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4724 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
4725 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4726 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm13[2],xmm1[3],xmm13[3]
4727 ; SSE-NEXT: movaps %xmm9, %xmm13
4728 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm4[2],xmm13[3],xmm4[3]
4729 ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
4730 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4731 ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm7[2],xmm9[3],xmm7[3]
4732 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4733 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4734 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
4735 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4736 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
4737 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4738 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4739 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
4740 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4741 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4742 ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
4743 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4744 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4745 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
4746 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4747 ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
4748 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
4749 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4750 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
4751 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4752 ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
4753 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4754 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4755 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
4756 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4757 ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
4758 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4759 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
4760 ; SSE-NEXT: movaps %xmm1, %xmm0
4761 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm12[0]
4762 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4763 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm12[1]
4764 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4765 ; SSE-NEXT: movaps %xmm8, %xmm1
4766 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm13[0]
; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: unpckhpd {{.*#+}} xmm8 = xmm8[1],xmm13[1]
; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm7, %xmm12
; SSE-NEXT: movlhps {{.*#+}} xmm12 = xmm12[0],xmm9[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm9[1]
; SSE-NEXT: movaps %xmm5, %xmm13
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movlhps {{.*#+}} xmm13 = xmm13[0],xmm1[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT: movaps %xmm4, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm6[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm6[1]
; SSE-NEXT: movaps %xmm0, %xmm5
; SSE-NEXT: movaps %xmm3, %xmm8
; SSE-NEXT: movlhps {{.*#+}} xmm8 = xmm8[0],xmm14[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm14[1]
; SSE-NEXT: movaps %xmm3, %xmm6
; SSE-NEXT: movaps %xmm2, %xmm3
; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm11[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm11[1]
; SSE-NEXT: movaps %xmm15, %xmm0
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm10[0]
; SSE-NEXT: unpckhpd {{.*#+}} xmm15 = xmm15[1],xmm10[1]
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 96(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 32(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 112(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 48(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 64(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 80(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%rsi)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 96(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 32(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 112(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 48(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 64(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 80(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%rdx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 96(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 32(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 112(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 48(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 64(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 80(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%rcx)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 112(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 96(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 80(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 64(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 48(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 32(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%r8)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 112(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 96(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 80(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 64(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 48(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 32(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%r9)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%r9)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 112(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 96(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 80(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 64(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 48(%rax)
; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 32(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, (%rax)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm3, 112(%rax)
; SSE-NEXT: movaps %xmm8, 96(%rax)
; SSE-NEXT: movaps %xmm4, 80(%rax)
; SSE-NEXT: movaps %xmm13, 64(%rax)
; SSE-NEXT: movaps %xmm12, 48(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 32(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT: movaps %xmm1, 16(%rax)
; SSE-NEXT: movaps %xmm0, (%rax)
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movaps %xmm2, 112(%rax)
; SSE-NEXT: movaps %xmm6, 96(%rax)
; SSE-NEXT: movaps %xmm5, 80(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 64(%rax)
; SSE-NEXT: movaps %xmm7, 48(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 32(%rax)
; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT: movaps %xmm0, 16(%rax)
; SSE-NEXT: movaps %xmm15, (%rax)
; SSE-NEXT: addq $952, %rsp # imm = 0x3B8
; AVX-LABEL: load_i32_stride8_vf32:
; AVX-NEXT: subq $1800, %rsp # imm = 0x708
; AVX-NEXT: vmovaps 288(%rdi), %xmm14
; AVX-NEXT: vmovaps 256(%rdi), %xmm10
; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm10[0],xmm14[0],xmm10[1],xmm14[1]
; AVX-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 352(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 320(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vmovaps 416(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX-NEXT: vmovaps 384(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vmovaps 480(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 448(%rdi), %xmm3
; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 928(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 896(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps 992(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 960(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm9 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm9[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovaps 800(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 768(%rdi), %xmm11
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm1[0],xmm11[1],xmm1[1]
; AVX-NEXT: vmovaps 864(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 832(%rdi), %xmm3
; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm7[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 128(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps 224(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 192(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vmovaps 32(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps (%rdi), %xmm13
; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
; AVX-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 96(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 64(%rdi), %xmm3
; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm5 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 672(%rdi), %xmm12
; AVX-NEXT: vmovaps 640(%rdi), %xmm0
; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vmovaps 736(%rdi), %xmm1
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 704(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm8[0,1,0,1]
; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX-NEXT: vmovaps 544(%rdi), %xmm6
; AVX-NEXT: vmovaps 512(%rdi), %xmm3
; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm3[0],xmm6[0],xmm3[1],xmm6[1]
; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 608(%rdi), %xmm4
; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps 576(%rdi), %xmm2
; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1]
; AVX-NEXT: vmovaps (%rsp), %xmm14 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm15[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm11[1,1,1,1]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm1
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm9[1,1,1,1]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0],xmm15[1],xmm7[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm13[1,1,1,1]
; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX-NEXT: # xmm5 = mem[1,1,1,1]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm13[1],xmm5[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm3[1,1,1,1]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm1
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm5[1,1,1,1]
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm12[1],xmm2[2,3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm4[2],xmm14[2],xmm4[3],xmm14[3]
; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[2,2,2,2]
; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm7[2,2,2,2]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm4 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm9[2],xmm15[2],xmm9[3],xmm15[3]
; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = mem[2,2,2,2]
; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX-NEXT: # xmm2 = mem[0,1,2],xmm2[3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm14[2,2,2,2]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[3]
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm4 # 16-byte Folded Reload
; AVX-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3]
; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm15[2,2,2,2]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm13[0,1,2],xmm2[3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm6
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm9[2,2,2,2]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm3[0,1,2],xmm6[3]
; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm11 # 16-byte Folded Reload
; AVX-NEXT: # xmm11 = xmm2[2],mem[2],xmm2[3],mem[3]
; AVX-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm12[2],xmm5[3],xmm12[3]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm10[2,2,2,2]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm5[0,1,2],xmm6[3]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm8
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm4[2,2,2,2]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm2[0,1,2],xmm8[3]
; AVX-NEXT: vblendps {{.*#+}} xmm8 = xmm11[0,1],xmm8[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm8[0,1,2,3],ymm6[4,5,6,7]
; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm6 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vunpckhpd {{.*#+}} xmm6 = xmm1[1],xmm6[1]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
; AVX-NEXT: # xmm7 = xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT: # xmm1 = mem[2,3,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm7[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm0[1],xmm1[1]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
; AVX-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX-NEXT: vpermilps $238, (%rsp), %xmm0 # 16-byte Folded Reload
; AVX-NEXT: # xmm0 = mem[2,3,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm6[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm3[2],xmm9[2],xmm3[3],xmm9[3]
; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm13[2],xmm15[2],xmm13[3],xmm15[3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX-NEXT: # xmm3 = mem[2,3,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm11[1],xmm0[1]
; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3]
; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 416(%rdi), %ymm2
; AVX-NEXT: vmovaps 384(%rdi), %ymm3
; AVX-NEXT: vmovaps 448(%rdi), %ymm1
; AVX-NEXT: vmovaps 480(%rdi), %ymm12
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm12[0],ymm1[0],ymm12[2],ymm1[2]
; AVX-NEXT: vmovaps %ymm1, %ymm9
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX-NEXT: vmovaps %ymm3, %ymm5
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps %ymm2, %ymm13
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX-NEXT: vmovaps 288(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 256(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmovaps 320(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 352(%rdi), %ymm4
; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm2[0],ymm4[2],ymm2[2]
; AVX-NEXT: vmovaps %ymm4, %ymm11
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 928(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 896(%rdi), %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 960(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 992(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX-NEXT: vmovaps 800(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 768(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmovaps 832(%rdi), %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 864(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 672(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 640(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 704(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 736(%rdi), %ymm4
; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm4[0],ymm0[0],ymm4[2],ymm0[2]
; AVX-NEXT: vmovaps %ymm4, %ymm10
; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX-NEXT: vmovaps 544(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 512(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX-NEXT: vmovaps 576(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 608(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[2],ymm0[2]
; AVX-NEXT: vshufps {{.*#+}} ymm8 = ymm8[0,1,2,0,4,5,6,4]
; AVX-NEXT: vextractf128 $1, %ymm8, %xmm8
; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm3[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 160(%rdi), %ymm0
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 128(%rdi), %ymm1
; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 192(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 224(%rdi), %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm3[0],ymm2[0],ymm3[2],ymm2[2]
; AVX-NEXT: vunpcklps {{.*#+}} ymm6 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[0,1],ymm3[2,0],ymm6[4,5],ymm3[6,4]
; AVX-NEXT: vmovaps 64(%rdi), %ymm14
; AVX-NEXT: vmovaps 96(%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklpd {{.*#+}} ymm3 = ymm2[0],ymm14[0],ymm2[2],ymm14[2]
; AVX-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm0
; AVX-NEXT: vmovaps (%rdi), %ymm2
; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovaps 32(%rdi), %ymm3
; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm15 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[4],ymm3[4],ymm2[5],ymm3[5]
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm9[0],ymm12[0],ymm9[1],ymm12[1],ymm9[4],ymm12[4],ymm9[5],ymm12[5]
; AVX-NEXT: vmovaps %ymm13, %ymm8
; AVX-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm13[1,0],ymm5[1,0],ymm13[5,4],ymm5[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[4],ymm11[4],ymm13[5],ymm11[5]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm3[1,0],ymm5[1,0],ymm3[5,4],ymm5[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups (%rsp), %ymm11 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm11[0],ymm7[1],ymm11[1],ymm7[4],ymm11[4],ymm7[5],ymm11[5]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[1,0],ymm9[1,0],ymm1[5,4],ymm9[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm6[1,0],ymm4[1,0],ymm6[5,4],ymm4[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm10[0],ymm0[1],ymm10[1],ymm0[4],ymm10[4],ymm0[5],ymm10[5]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm6[1,0],ymm10[1,0],ymm6[5,4],ymm10[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[0],mem[0],ymm1[1],mem[1],ymm1[4],mem[4],ymm1[5],mem[5]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm15[1,0],mem[1,0],ymm15[5,4],mem[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[1,0],mem[1,0],ymm1[5,4],mem[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm14[0],mem[0],ymm14[1],mem[1],ymm14[4],mem[4],ymm14[5],mem[5]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm14[1,0],ymm2[1,0],ymm14[5,4],ymm2[5,4]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm12[1],mem[1],ymm12[3],mem[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm8[2],ymm1[3],ymm8[3],ymm1[6],ymm8[6],ymm1[7],ymm8[7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[6],ymm3[6],ymm5[7],ymm3[7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm12[1],ymm13[1],ymm12[3],ymm13[3]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm11[1],ymm7[1],ymm11[3],ymm7[3]
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm9[2],mem[2],ymm9[3],mem[3],ymm9[6],mem[6],ymm9[7],mem[7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm4[2],mem[2],ymm4[3],mem[3],ymm4[6],mem[6],ymm4[7],mem[7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm3[1],mem[1],ymm3[3],mem[3]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm5[1],ymm7[1],ymm5[3],ymm7[3]
; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[6],ymm6[6],ymm10[7],ymm6[7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm6[2],ymm8[2],ymm6[3],ymm8[3],ymm6[6],ymm8[6],ymm6[7],ymm8[7]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{.*#+}} ymm15 = ymm4[1],ymm3[1],ymm4[3],ymm3[3]
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[0,1,2,0,4,5,6,4]
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm11[1],ymm0[3],ymm11[3]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm13[2],ymm10[2],ymm13[3],ymm10[3],ymm13[6],ymm10[6],ymm13[7],ymm10[7]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm9[1],mem[1],ymm9[3],mem[3]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vunpckhps {{.*#+}} ymm15 = ymm2[2],ymm14[2],ymm2[3],ymm14[3],ymm2[6],ymm14[6],ymm2[7],ymm14[7]
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm1[2],ymm12[2],ymm1[3],ymm12[3],ymm1[6],ymm12[6],ymm1[7],ymm12[7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm15 # 32-byte Folded Reload
; AVX-NEXT: # ymm15 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} ymm15 = ymm15[2,0,2,3,6,4,6,7]
; AVX-NEXT: vextractf128 $1, %ymm15, %xmm15
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
; AVX-NEXT: vunpckhps {{.*#+}} ymm1 = ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[6],ymm4[6],ymm3[7],ymm4[7]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm8[3,0],ymm6[3,0],ymm8[7,4],ymm6[7,4]
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX-NEXT: vunpckhps (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm2[2,0],ymm1[2,3],ymm2[6,4],ymm1[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm4 # 32-byte Folded Reload
; AVX-NEXT: # ymm4 = ymm3[3,0],mem[3,0],ymm3[7,4],mem[7,4]
; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm4[2,0,2,3,6,4,6,7]
; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm2 # 32-byte Folded Reload
; AVX-NEXT: # ymm2 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
; AVX-NEXT: vshufps {{.*#+}} ymm4 = ymm10[3,0],ymm13[3,0],ymm10[7,4],ymm13[7,4]
; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm4[2,0],ymm2[2,3],ymm4[6,4],ymm2[6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vunpckhps {{.*#+}} ymm4 = ymm3[2],ymm9[2],ymm3[3],ymm9[3],ymm3[6],ymm9[6],ymm3[7],ymm9[7]
; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm3 # 32-byte Folded Reload
; AVX-NEXT: # ymm3 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4]
; AVX-NEXT: vextractf128 $1, %ymm4, %xmm4
; AVX-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,0,2,3,6,4,6,7]
; AVX-NEXT: vextractf128 $1, %ymm3, %xmm3
; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 64(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, (%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 96(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 64(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, (%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 96(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 32(%rdx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 64(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, (%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 96(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 32(%rcx)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 64(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, (%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 96(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 32(%r8)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 64(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, (%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 96(%r9)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 32(%r9)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 64(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, (%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 96(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 32(%rax)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 64(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, (%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 96(%rax)
; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX-NEXT: vmovaps %ymm3, 32(%rax)
; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm1, 96(%rax)
; AVX-NEXT: vmovaps %ymm0, 64(%rax)
; AVX-NEXT: vmovaps %ymm12, 32(%rax)
; AVX-NEXT: vmovaps %ymm2, (%rax)
; AVX-NEXT: addq $1800, %rsp # imm = 0x708
; AVX-NEXT: vzeroupper
; AVX2-LABEL: load_i32_stride8_vf32:
; AVX2-NEXT: subq $1544, %rsp # imm = 0x608
; AVX2-NEXT: vmovaps 288(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 256(%rdi), %xmm2
; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; AVX2-NEXT: vmovaps %xmm2, %xmm10
; AVX2-NEXT: vmovaps 352(%rdi), %xmm2
; AVX2-NEXT: vbroadcastss %xmm2, %xmm1
; AVX2-NEXT: vmovaps %xmm2, %xmm14
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 320(%rdi), %xmm3
; AVX2-NEXT: vbroadcastss %xmm3, %xmm2
; AVX2-NEXT: vmovaps %xmm3, %xmm15
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vmovaps 416(%rdi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 384(%rdi), %xmm12
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
; AVX2-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vmovaps 480(%rdi), %xmm2
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm2, %xmm2
; AVX2-NEXT: vmovaps 448(%rdi), %xmm3
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm3, %xmm3
; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 800(%rdi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 768(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vmovaps 864(%rdi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-NEXT: vmovaps 832(%rdi), %xmm2
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm2, %xmm2
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vmovaps 992(%rdi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-NEXT: vmovaps 960(%rdi), %xmm11
; AVX2-NEXT: vbroadcastss %xmm11, %xmm2
; AVX2-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT: vmovaps 928(%rdi), %xmm2
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 896(%rdi), %xmm3
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 608(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-NEXT: vmovaps 576(%rdi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-NEXT: vmovaps 544(%rdi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX2-NEXT: vmovaps 512(%rdi), %xmm2
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-NEXT: vmovaps 736(%rdi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-NEXT: vmovaps 704(%rdi), %xmm2
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm2, %xmm2
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT: vmovaps 672(%rdi), %xmm2
; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 640(%rdi), %xmm3
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps 224(%rdi), %xmm0
; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-NEXT: vmovaps 192(%rdi), %xmm1
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-NEXT: vmovaps 160(%rdi), %xmm9
; AVX2-NEXT: vmovaps 128(%rdi), %xmm8
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-NEXT: vmovaps 96(%rdi), %xmm7
; AVX2-NEXT: vbroadcastss %xmm7, %xmm1
; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps 64(%rdi), %xmm6
; AVX2-NEXT: vbroadcastss %xmm6, %xmm2
; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-NEXT: vmovaps (%rdi), %xmm5
; AVX2-NEXT: vmovaps 32(%rdi), %xmm4
; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1]
; AVX2-NEXT: vmovaps %xmm10, %xmm3
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3]
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1]
; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = mem[1,1,1,1]
; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1]
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1]
; AVX2-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1]
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3]
; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
5778 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5779 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5780 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2]
5781 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5782 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
5783 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5784 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5785 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5786 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5787 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
5788 ; AVX2-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
5789 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5790 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
5791 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5792 ; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5793 ; AVX2-NEXT: # xmm15 = mem[2,2,2,2]
5794 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5795 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
5796 ; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
5797 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14
5798 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
5799 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5800 ; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2]
5801 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3]
5802 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3]
5803 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
5804 ; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5805 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
5806 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
5807 ; AVX2-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2]
5808 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5809 ; AVX2-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
5810 ; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
5811 ; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15
5812 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
5813 ; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2]
5814 ; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
5815 ; AVX2-NEXT: # xmm15 = mem[0,1,2],xmm15[3]
5816 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3]
5817 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
5818 ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5819 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5820 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload
5821 ; AVX2-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3]
5822 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5823 ; AVX2-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2]
5824 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
5825 ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3]
5826 ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
5827 ; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11
5828 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7]
5829 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5830 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
5831 ; AVX2-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3]
5832 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
5833 ; AVX2-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2]
5834 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
5835 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
5836 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
5837 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
5838 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5839 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
5840 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5841 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1]
5842 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
5843 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
5844 ; AVX2-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3]
5845 ; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
5846 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5847 ; AVX2-NEXT: # xmm3 = mem[2,3,2,3]
5848 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
5849 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
5850 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5851 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5852 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5853 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
5854 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5855 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
5856 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
5857 ; AVX2-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3]
5858 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
5859 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5860 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
5861 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
5862 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
5863 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5864 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5865 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5866 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5867 ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
5868 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1]
5869 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
5870 ; AVX2-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3]
5871 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
5872 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3]
5873 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
5874 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5875 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5876 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5877 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3]
5878 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
5879 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3]
5880 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
5881 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5882 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3]
5883 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1]
5884 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5885 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5886 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm0
5887 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5888 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm1
5889 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5890 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
5891 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
5892 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm1
5893 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5894 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm2
5895 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5896 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
5897 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2]
5898 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5899 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm2
5900 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5901 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm3
5902 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5903 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm8
5904 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm1
5905 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5906 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5]
5907 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
5908 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2]
5909 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5910 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5911 ; AVX2-NEXT: vmovaps 800(%rdi), %ymm0
5912 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5913 ; AVX2-NEXT: vmovaps 768(%rdi), %ymm1
5914 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5915 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
5916 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
5917 ; AVX2-NEXT: vmovaps 864(%rdi), %ymm1
5918 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5919 ; AVX2-NEXT: vmovaps 832(%rdi), %ymm2
5920 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5921 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
5922 ; AVX2-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
5923 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
5924 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5925 ; AVX2-NEXT: vmovaps 992(%rdi), %ymm2
5926 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5927 ; AVX2-NEXT: vmovaps 960(%rdi), %ymm3
5928 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5929 ; AVX2-NEXT: vmovaps 928(%rdi), %ymm7
5930 ; AVX2-NEXT: vmovaps 896(%rdi), %ymm1
5931 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5932 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5]
5933 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5934 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
5935 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
5936 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5937 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5938 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
5939 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5940 ; AVX2-NEXT: vmovaps (%rdi), %ymm1
5941 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5942 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
5943 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
5944 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm1
5945 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5946 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
5947 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5948 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
5949 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
5950 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5951 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm2
5952 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5953 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm3
5954 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5955 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm15
5956 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm1
5957 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5958 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
5959 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
5960 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
5961 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5962 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5963 ; AVX2-NEXT: vmovaps 544(%rdi), %ymm14
5964 ; AVX2-NEXT: vmovaps 512(%rdi), %ymm11
5965 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5]
5966 ; AVX2-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5967 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
5968 ; AVX2-NEXT: vmovaps 608(%rdi), %ymm1
5969 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5970 ; AVX2-NEXT: vmovaps 576(%rdi), %ymm2
5971 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5972 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
5973 ; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2]
5974 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3]
5975 ; AVX2-NEXT: vmovaps 736(%rdi), %ymm2
5976 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5977 ; AVX2-NEXT: vmovaps 704(%rdi), %ymm3
5978 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5979 ; AVX2-NEXT: vmovaps 672(%rdi), %ymm5
5980 ; AVX2-NEXT: vmovaps 640(%rdi), %ymm0
5981 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5982 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5]
5983 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
5984 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
5985 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5986 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5987 ; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm0
5988 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
5989 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
5990 ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm1
5991 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
5992 ; AVX2-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5]
5993 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5994 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7]
5995 ; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm10
5996 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
5997 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5998 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5999 ; AVX2-NEXT: vbroadcastss 916(%rdi), %ymm0
6000 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
6001 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
6002 ; AVX2-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
6003 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
6004 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6005 ; AVX2-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5]
6006 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6007 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7]
6008 ; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm7
6009 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
6010 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6011 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
6012 ; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm0
6013 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
6014 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
6015 ; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm1
6016 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6017 ; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5]
6018 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6019 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7]
6020 ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4
6021 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
6022 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6023 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6024 ; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm0
6025 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
6026 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6027 ; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm1
6028 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
6029 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
6030 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
6031 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6032 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6033 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6034 ; AVX2-NEXT: vbroadcastss 504(%rdi), %ymm0
6035 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
6036 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
6037 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6038 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
6039 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7]
6040 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6041 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6042 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
6043 ; AVX2-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
6044 ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm2
6045 ; AVX2-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2]
6046 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
6047 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6048 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6049 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6050 ; AVX2-NEXT: vbroadcastss 1016(%rdi), %ymm1
6051 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6052 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7]
6053 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6054 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
6055 ; AVX2-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
6056 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
6057 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6058 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
6059 ; AVX2-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
6060 ; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm2
6061 ; AVX2-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2]
6062 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
6063 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6064 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6065 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6066 ; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm2
6067 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6068 ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
6069 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6070 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7]
6071 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
6072 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6073 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
6074 ; AVX2-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6075 ; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm8
6076 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
6077 ; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3]
6078 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
6079 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
6080 ; AVX2-NEXT: vbroadcastss 760(%rdi), %ymm2
6081 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6082 ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
6083 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6084 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
6085 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
6086 ; AVX2-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
6087 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6088 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
6089 ; AVX2-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6090 ; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm15
6091 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
6092 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
6093 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
6094 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6095 ; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm1
6096 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6097 ; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6098 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
6099 ; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm4
6100 ; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
6101 ; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9
6102 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
6103 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7]
6104 ; AVX2-NEXT: vbroadcastss 476(%rdi), %ymm1
6105 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6106 ; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6107 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6108 ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm1
6109 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
6110 ; AVX2-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7]
6111 ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6
6112 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
6113 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6114 ; AVX2-NEXT: vbroadcastss 732(%rdi), %ymm1
6115 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6116 ; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6117 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3]
6118 ; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm5
6119 ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7]
6120 ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6
6121 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6122 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
6123 ; AVX2-NEXT: vbroadcastss 988(%rdi), %ymm1
6124 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6125 ; AVX2-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6126 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
6127 ; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm3
6128 ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7]
6129 ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6
6130 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
6131 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6132 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6133 ; AVX2-NEXT: vmovaps %ymm3, 64(%rsi)
6134 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6135 ; AVX2-NEXT: vmovaps %ymm3, (%rsi)
6136 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6137 ; AVX2-NEXT: vmovaps %ymm3, 96(%rsi)
6138 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6139 ; AVX2-NEXT: vmovaps %ymm3, 32(%rsi)
6140 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6141 ; AVX2-NEXT: vmovaps %ymm3, 64(%rdx)
6142 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6143 ; AVX2-NEXT: vmovaps %ymm3, (%rdx)
6144 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6145 ; AVX2-NEXT: vmovaps %ymm3, 96(%rdx)
6146 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6147 ; AVX2-NEXT: vmovaps %ymm3, 32(%rdx)
6148 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6149 ; AVX2-NEXT: vmovaps %ymm3, 64(%rcx)
6150 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6151 ; AVX2-NEXT: vmovaps %ymm3, (%rcx)
6152 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6153 ; AVX2-NEXT: vmovaps %ymm3, 96(%rcx)
6154 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6155 ; AVX2-NEXT: vmovaps %ymm3, 32(%rcx)
6156 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6157 ; AVX2-NEXT: vmovaps %ymm3, 64(%r8)
6158 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6159 ; AVX2-NEXT: vmovaps %ymm3, (%r8)
6160 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6161 ; AVX2-NEXT: vmovaps %ymm3, 96(%r8)
6162 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6163 ; AVX2-NEXT: vmovaps %ymm3, 32(%r8)
6164 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6165 ; AVX2-NEXT: vmovaps %ymm3, 64(%r9)
6166 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6167 ; AVX2-NEXT: vmovaps %ymm3, (%r9)
6168 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6169 ; AVX2-NEXT: vmovaps %ymm3, 96(%r9)
6170 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6171 ; AVX2-NEXT: vmovaps %ymm3, 32(%r9)
6172 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
6173 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6174 ; AVX2-NEXT: vmovaps %ymm3, 64(%rax)
6175 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6176 ; AVX2-NEXT: vmovaps %ymm3, (%rax)
6177 ; AVX2-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
6178 ; AVX2-NEXT: vmovaps %ymm3, 96(%rax)
6179 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6180 ; AVX2-NEXT: vmovaps %ymm3, 32(%rax)
6181 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
6182 ; AVX2-NEXT: vmovaps %ymm2, 64(%rax)
6183 ; AVX2-NEXT: vmovaps %ymm8, (%rax)
6184 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6185 ; AVX2-NEXT: vmovaps %ymm2, 96(%rax)
6186 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6187 ; AVX2-NEXT: vmovaps %ymm2, 32(%rax)
6188 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
6189 ; AVX2-NEXT: vmovaps %ymm1, 96(%rax)
6190 ; AVX2-NEXT: vmovaps %ymm5, 64(%rax)
6191 ; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
6192 ; AVX2-NEXT: vmovaps %ymm4, (%rax)
6193 ; AVX2-NEXT: addq $1544, %rsp # imm = 0x608
6194 ; AVX2-NEXT: vzeroupper
6195 ; AVX2-NEXT: retq
6196 ;
6197 ; AVX2-FP-LABEL: load_i32_stride8_vf32:
6198 ; AVX2-FP: # %bb.0:
6199 ; AVX2-FP-NEXT: subq $1544, %rsp # imm = 0x608
6200 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm0
6201 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6202 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm2
6203 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
6204 ; AVX2-FP-NEXT: vmovaps %xmm2, %xmm10
6205 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm2
6206 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm1
6207 ; AVX2-FP-NEXT: vmovaps %xmm2, %xmm14
6208 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6209 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm3
6210 ; AVX2-FP-NEXT: vbroadcastss %xmm3, %xmm2
6211 ; AVX2-FP-NEXT: vmovaps %xmm3, %xmm15
6212 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6213 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6214 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6215 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1
6216 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6217 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm12
6218 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
6219 ; AVX2-FP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6220 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6221 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm2
6222 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6223 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2
6224 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm3
6225 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6226 ; AVX2-FP-NEXT: vbroadcastss %xmm3, %xmm3
6227 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6228 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6229 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6230 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6231 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6232 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %xmm1
6233 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6234 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm0
6235 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6236 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
6237 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %xmm1
6238 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6239 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
6240 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm2
6241 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6242 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2
6243 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6244 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6245 ; AVX2-FP-NEXT: vmovaps 992(%rdi), %xmm1
6246 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6247 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
6248 ; AVX2-FP-NEXT: vmovaps 960(%rdi), %xmm11
6249 ; AVX2-FP-NEXT: vbroadcastss %xmm11, %xmm2
6250 ; AVX2-FP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6251 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6252 ; AVX2-FP-NEXT: vmovaps 928(%rdi), %xmm2
6253 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6254 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %xmm3
6255 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6256 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6257 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6258 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6259 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6260 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6261 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6262 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm0
6263 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6264 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0
6265 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %xmm1
6266 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6267 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
6268 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6269 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm1
6270 ; AVX2-FP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
6271 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm2
6272 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6273 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6274 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
6275 ; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm1
6276 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6277 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
6278 ; AVX2-FP-NEXT: vmovaps 704(%rdi), %xmm2
6279 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6280 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2
6281 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6282 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %xmm2
6283 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6284 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %xmm3
6285 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6286 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6287 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6288 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6289 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6290 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6291 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6292 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm0
6293 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6294 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0
6295 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm1
6296 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6297 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
6298 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6299 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm9
6300 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm8
6301 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
6302 ; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6303 ; AVX2-FP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6304 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6305 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
6306 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6307 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm7
6308 ; AVX2-FP-NEXT: vbroadcastss %xmm7, %xmm1
6309 ; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6310 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm6
6311 ; AVX2-FP-NEXT: vbroadcastss %xmm6, %xmm2
6312 ; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6313 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6314 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm5
6315 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm4
6316 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6317 ; AVX2-FP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6318 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6319 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
6320 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7]
6321 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6322 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1]
6323 ; AVX2-FP-NEXT: vmovaps %xmm10, %xmm3
6324 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6325 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3]
6326 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
6327 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6328 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6329 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6330 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
6331 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6332 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1]
6333 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6334 ; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
6335 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6336 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6337 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6338 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6339 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6340 ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1]
6341 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6342 ; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
6343 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6344 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload
6345 ; AVX2-FP-NEXT: # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1]
6346 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6347 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
6348 ; AVX2-FP-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
6349 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6350 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6351 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1]
6352 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6353 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
6354 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6355 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6356 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6357 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6358 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
6359 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
6360 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
6361 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6362 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6363 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6364 ; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
6365 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6366 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1]
6367 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
6368 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6369 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6370 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6371 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6372 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6373 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1]
6374 ; AVX2-FP-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload
6375 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
6376 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
6377 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6378 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
6379 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6380 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6381 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6382 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
6383 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6384 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
6385 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1]
6386 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
6387 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3]
6388 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6389 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6390 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6391 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6392 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3]
6393 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6394 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6395 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
6396 ; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
6397 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6398 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
6399 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
6400 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
6401 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6402 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6403 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6404 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2]
6405 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6406 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
6407 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
6408 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6409 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6410 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6411 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
6412 ; AVX2-FP-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
6413 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6414 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
6415 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6416 ; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
6417 ; AVX2-FP-NEXT: # xmm15 = mem[2,2,2,2]
6418 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6419 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
6420 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
6421 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14
6422 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
6423 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6424 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2]
6425 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3]
6426 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3]
6427 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
6428 ; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6429 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
6430 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
6431 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2]
6432 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6433 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
6434 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
6435 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15
6436 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
6437 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2]
6438 ; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
6439 ; AVX2-FP-NEXT: # xmm15 = mem[0,1,2],xmm15[3]
6440 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3]
6441 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
6442 ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6443 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6444 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload
6445 ; AVX2-FP-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3]
6446 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6447 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2]
6448 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
6449 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3]
6450 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
6451 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11
6452 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7]
6453 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6454 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
6455 ; AVX2-FP-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3]
6456 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6457 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2]
6458 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6459 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
6460 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6461 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
6462 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6463 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
6464 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6465 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1]
6466 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
6467 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
6468 ; AVX2-FP-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3]
6469 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
6470 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
6471 ; AVX2-FP-NEXT: # xmm3 = mem[2,3,2,3]
6472 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
6473 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
6474 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
6475 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6476 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
6477 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
6478 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6479 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
6480 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
6481 ; AVX2-FP-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3]
6482 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6483 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
6484 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
6485 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6486 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6487 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6488 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6489 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6490 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6491 ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
6492 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1]
6493 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
6494 ; AVX2-FP-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3]
6495 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6496 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3]
6497 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6498 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6499 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6500 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6501 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3]
6502 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
6503 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3]
6504 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6505 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6506 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3]
6507 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1]
6508 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6509 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6510 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm0
6511 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6512 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm1
6513 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6514 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6515 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
6516 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm1
6517 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6518 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2
6519 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6520 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
6521 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2]
6522 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6523 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm2
6524 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6525 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm3
6526 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6527 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm8
6528 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm1
6529 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6530 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5]
6531 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
6532 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2]
6533 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6534 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6535 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %ymm0
6536 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6537 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm1
6538 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6539 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6540 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
6541 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm1
6542 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6543 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm2
6544 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6545 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
6546 ; AVX2-FP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
6547 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
6548 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6549 ; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm2
6550 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6551 ; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm3
6552 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6553 ; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm7
6554 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm1
6555 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6556 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5]
6557 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6558 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
6559 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
6560 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6561 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6562 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0
6563 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6564 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
6565 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6566 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
6567 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
6568 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1
6569 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6570 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2
6571 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6572 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
6573 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
6574 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6575 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2
6576 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6577 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3
6578 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6579 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm15
6580 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1
6581 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6582 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
6583 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
6584 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
6585 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6586 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6587 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm14
6588 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm11
6589 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5]
6590 ; AVX2-FP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6591 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
6592 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm1
6593 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6594 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm2
6595 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6596 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
6597 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2]
6598 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3]
6599 ; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm2
6600 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6601 ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm3
6602 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6603 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm5
6604 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm0
6605 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6606 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5]
6607 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
6608 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
6609 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6610 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6611 ; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm0
6612 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
6613 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
6614 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm1
6615 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
6616 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5]
6617 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6618 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7]
6619 ; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm10
6620 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
6621 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6622 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6623 ; AVX2-FP-NEXT: vbroadcastss 916(%rdi), %ymm0
6624 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
6625 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
6626 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
6627 ; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
6628 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
6629 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5]
6630 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
6631 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7]
6632 ; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm7
6633 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
6634 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6635 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
6636 ; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm0
6637 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
6638 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
6639 ; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm1
6640 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
6641 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5]
6642 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
6643 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7]
6644 ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4
6645 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
6646 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6647 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6648 ; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm0
6649 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
6650 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6651 ; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm1
6652 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
6653 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
6654 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
6655 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
6656 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6657 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6658 ; AVX2-FP-NEXT: vbroadcastss 504(%rdi), %ymm0
6659 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
6660 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
6661 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
6662 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
6663 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7]
6664 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6665 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6666 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
6667 ; AVX2-FP-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
6668 ; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm2
6669 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2]
6670 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
6671 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6672 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6673 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6674 ; AVX2-FP-NEXT: vbroadcastss 1016(%rdi), %ymm1
6675 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6676 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7]
6677 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6678 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
6679 ; AVX2-FP-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
6680 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
6681 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6682 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
6683 ; AVX2-FP-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
6684 ; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm2
6685 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2]
6686 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
6687 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6688 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6689 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6690 ; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm2
6691 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6692 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
6693 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6694 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7]
6695 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
6696 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6697 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
6698 ; AVX2-FP-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6699 ; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm8
6700 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
6701 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3]
6702 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
6703 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
6704 ; AVX2-FP-NEXT: vbroadcastss 760(%rdi), %ymm2
6705 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
6706 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
6707 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6708 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
6709 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
6710 ; AVX2-FP-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
6711 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
6712 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
6713 ; AVX2-FP-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6714 ; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm15
6715 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
6716 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
6717 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
6718 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6719 ; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm1
6720 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6721 ; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6722 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
6723 ; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm4
6724 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
6725 ; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9
6726 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
6727 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7]
6728 ; AVX2-FP-NEXT: vbroadcastss 476(%rdi), %ymm1
6729 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6730 ; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6731 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
6732 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm1
6733 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
6734 ; AVX2-FP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7]
6735 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6
6736 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
6737 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6738 ; AVX2-FP-NEXT: vbroadcastss 732(%rdi), %ymm1
6739 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6740 ; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6741 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3]
6742 ; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm5
6743 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7]
6744 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6
6745 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
6746 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
6747 ; AVX2-FP-NEXT: vbroadcastss 988(%rdi), %ymm1
6748 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
6749 ; AVX2-FP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
6750 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
6751 ; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm3
6752 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7]
6753 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6
6754 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
6755 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6756 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6757 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rsi)
6758 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6759 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi)
6760 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6761 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rsi)
6762 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6763 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi)
6764 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6765 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rdx)
6766 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6767 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx)
6768 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6769 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rdx)
6770 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6771 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx)
6772 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6773 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rcx)
6774 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6775 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rcx)
6776 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6777 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rcx)
6778 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6779 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rcx)
6780 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6781 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r8)
6782 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6783 ; AVX2-FP-NEXT: vmovaps %ymm3, (%r8)
6784 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6785 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r8)
6786 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6787 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r8)
6788 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6789 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%r9)
6790 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6791 ; AVX2-FP-NEXT: vmovaps %ymm3, (%r9)
6792 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6793 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%r9)
6794 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6795 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%r9)
6796 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6797 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6798 ; AVX2-FP-NEXT: vmovaps %ymm3, 64(%rax)
6799 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6800 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rax)
6801 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
6802 ; AVX2-FP-NEXT: vmovaps %ymm3, 96(%rax)
6803 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
6804 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rax)
6805 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6806 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rax)
6807 ; AVX2-FP-NEXT: vmovaps %ymm8, (%rax)
6808 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6809 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax)
6810 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
6811 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax)
6812 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6813 ; AVX2-FP-NEXT: vmovaps %ymm1, 96(%rax)
6814 ; AVX2-FP-NEXT: vmovaps %ymm5, 64(%rax)
6815 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
6816 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rax)
6817 ; AVX2-FP-NEXT: addq $1544, %rsp # imm = 0x608
6818 ; AVX2-FP-NEXT: vzeroupper
6819 ; AVX2-FP-NEXT: retq
6821 ; AVX2-FCP-LABEL: load_i32_stride8_vf32:
6822 ; AVX2-FCP: # %bb.0:
6823 ; AVX2-FCP-NEXT: subq $1544, %rsp # imm = 0x608
6824 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm0
6825 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6826 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm2
6827 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
6828 ; AVX2-FCP-NEXT: vmovaps %xmm2, %xmm10
6829 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm2
6830 ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm1
6831 ; AVX2-FCP-NEXT: vmovaps %xmm2, %xmm14
6832 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6833 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm3
6834 ; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm2
6835 ; AVX2-FCP-NEXT: vmovaps %xmm3, %xmm15
6836 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6837 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6838 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6839 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm1
6840 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6841 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm12
6842 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
6843 ; AVX2-FCP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6844 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6845 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm2
6846 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6847 ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2
6848 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm3
6849 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6850 ; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm3
6851 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6852 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6853 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6854 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6855 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6856 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %xmm1
6857 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6858 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm0
6859 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6860 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
6861 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %xmm1
6862 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6863 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
6864 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm2
6865 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6866 ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2
6867 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6868 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6869 ; AVX2-FCP-NEXT: vmovaps 992(%rdi), %xmm1
6870 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6871 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
6872 ; AVX2-FCP-NEXT: vmovaps 960(%rdi), %xmm11
6873 ; AVX2-FCP-NEXT: vbroadcastss %xmm11, %xmm2
6874 ; AVX2-FCP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6875 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6876 ; AVX2-FCP-NEXT: vmovaps 928(%rdi), %xmm2
6877 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6878 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %xmm3
6879 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6880 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6881 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6882 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6883 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6884 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6885 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6886 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %xmm0
6887 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6888 ; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0
6889 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %xmm1
6890 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6891 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
6892 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6893 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %xmm1
6894 ; AVX2-FCP-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill
6895 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm2
6896 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6897 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6898 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
6899 ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %xmm1
6900 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6901 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
6902 ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %xmm2
6903 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6904 ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2
6905 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6906 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %xmm2
6907 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6908 ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %xmm3
6909 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6910 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6911 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6912 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6913 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6914 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6915 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6916 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm0
6917 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6918 ; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0
6919 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm1
6920 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6921 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
6922 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6923 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm9
6924 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm8
6925 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm8[0],xmm9[0],xmm8[1],xmm9[1]
6926 ; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6927 ; AVX2-FCP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6928 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6929 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
6930 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6931 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm7
6932 ; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm1
6933 ; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6934 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm6
6935 ; AVX2-FCP-NEXT: vbroadcastss %xmm6, %xmm2
6936 ; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6937 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
6938 ; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm5
6939 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm4
6940 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6941 ; AVX2-FCP-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6942 ; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6943 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
6944 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm13[4,5,6,7]
6945 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6946 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm10[1,1,1,1]
6947 ; AVX2-FCP-NEXT: vmovaps %xmm10, %xmm3
6948 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
6949 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3]
6950 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
6951 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6952 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
6953 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
6954 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
6955 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6956 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm12[1,1,1,1]
6957 ; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
6958 ; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
6959 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6960 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6961 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6962 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6963 ; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
6964 ; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1]
6965 ; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6966 ; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
6967 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
6968 ; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm1 # 16-byte Folded Reload
6969 ; AVX2-FCP-NEXT: # xmm1 = xmm12[0],mem[0],xmm12[1],mem[1]
6970 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6971 ; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
6972 ; AVX2-FCP-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
6973 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6974 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
6975 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm11[1,1,1,1]
6976 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
6977 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1],xmm2[2,3]
6978 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6979 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6980 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6981 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6982 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
6983 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2,3]
6984 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
6985 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6986 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6987 ; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6988 ; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
6989 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
6990 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm8[1,1,1,1]
6991 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm9[1],xmm2[2,3]
6992 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
6993 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6994 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6995 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6996 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
6997 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1]
6998 ; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm7 # 16-byte Reload
6999 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
7000 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
7001 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7002 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
7003 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
7004 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7005 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7006 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
7007 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
7008 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
7009 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm6[1,1,1,1]
7010 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
7011 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm5[1],xmm2[2,3]
7012 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
7013 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
7014 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7015 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7016 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm3[2],xmm13[2],xmm3[3],xmm13[3]
7017 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7018 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7019 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
7020 ; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
7021 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7022 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
7023 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
7024 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
7025 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
7026 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
7027 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
7028 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2]
7029 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7030 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
7031 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
7032 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7033 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7034 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7035 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
7036 ; AVX2-FCP-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
7037 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7038 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
7039 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7040 ; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
7041 ; AVX2-FCP-NEXT: # xmm15 = mem[2,2,2,2]
7042 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7043 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
7044 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
7045 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14
7046 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
7047 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7048 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm0[2,2,2,2]
7049 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm12[0,1,2],xmm15[3]
7050 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm3[0,1],xmm15[2,3]
7051 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm15[0,1,2,3],ymm14[4,5,6,7]
7052 ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7053 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm14 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
7054 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm12 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
7055 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm11 = xmm4[2,2,2,2]
7056 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
7057 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
7058 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
7059 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm15
7060 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
7061 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2]
7062 ; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
7063 ; AVX2-FCP-NEXT: # xmm15 = mem[0,1,2],xmm15[3]
7064 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1],xmm15[2,3]
7065 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3],ymm11[4,5,6,7]
7066 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7067 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7068 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm9 # 16-byte Folded Reload
7069 ; AVX2-FCP-NEXT: # xmm9 = xmm3[2],mem[2],xmm3[3],mem[3]
7070 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
7071 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm7 = xmm15[2,2,2,2]
7072 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
7073 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm4[0,1,2],xmm7[3]
7074 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
7075 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm11
7076 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7]
7077 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7078 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm6 # 16-byte Folded Reload
7079 ; AVX2-FCP-NEXT: # xmm6 = xmm3[2],mem[2],xmm3[3],mem[3]
7080 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
7081 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm5 = xmm11[2,2,2,2]
7082 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
7083 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm8[0,1,2],xmm5[3]
7084 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
7085 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm7[4,5,6,7]
7086 ; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7087 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
7088 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7089 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm5[1]
7090 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
7091 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm5 # 16-byte Folded Reload
7092 ; AVX2-FCP-NEXT: # xmm5 = xmm3[2],mem[2],xmm3[3],mem[3]
7093 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
7094 ; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
7095 ; AVX2-FCP-NEXT: # xmm3 = mem[2,3,2,3]
7096 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
7097 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
7098 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
7099 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7100 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
7101 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7102 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7103 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
7104 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
7105 ; AVX2-FCP-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3]
7106 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
7107 ; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
7108 ; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
7109 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
7110 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
7111 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7112 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7113 ; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7114 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
7115 ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
7116 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm14[1],xmm0[1]
7117 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
7118 ; AVX2-FCP-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3]
7119 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
7120 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3]
7121 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
7122 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
7123 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7124 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7125 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm15[2],xmm4[3],xmm15[3]
7126 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
7127 ; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,3,2,3]
7128 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
7129 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
7130 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm11[2],xmm8[3],xmm11[3]
7131 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm6[1],xmm1[1]
7132 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7133 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7134 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm0
7135 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7136 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm1
7137 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7138 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
7139 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
7140 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm1
7141 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7142 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2
7143 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7144 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
7145 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm6[2,2,2,2]
7146 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
7147 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm2
7148 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7149 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm3
7150 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7151 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm8
7152 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm1
7153 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7154 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[1],ymm8[1],ymm1[4],ymm8[4],ymm1[5],ymm8[5]
7155 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
7156 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm10[0],ymm1[2],ymm10[2]
7157 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7158 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7159 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %ymm0
7160 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7161 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm1
7162 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7163 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
7164 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
7165 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm1
7166 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7167 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm2
7168 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7169 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
7170 ; AVX2-FCP-NEXT: vmovups %ymm1, (%rsp) # 32-byte Spill
7171 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
7172 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
7173 ; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm2
7174 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7175 ; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm3
7176 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7177 ; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm7
7178 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm1
7179 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7180 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[4],ymm7[4],ymm1[5],ymm7[5]
7181 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7182 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
7183 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
7184 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7185 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7186 ; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0
7187 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7188 ; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
7189 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7190 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
7191 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
7192 ; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1
7193 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7194 ; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2
7195 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7196 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm13 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
7197 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
7198 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
7199 ; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2
7200 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7201 ; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3
7202 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7203 ; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm15
7204 ; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1
7205 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7206 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
7207 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
7208 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[2],ymm4[2]
7209 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7210 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7211 ; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm14
7212 ; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm11
7213 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm11[0],ymm14[0],ymm11[1],ymm14[1],ymm11[4],ymm14[4],ymm11[5],ymm14[5]
7214 ; AVX2-FCP-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7215 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
7216 ; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm1
7217 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7218 ; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm2
7219 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7220 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
7221 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm5 = ymm9[2,2,2,2]
7222 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],xmm5[2,3]
7223 ; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm2
7224 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7225 ; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm3
7226 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7227 ; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm5
7228 ; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm0
7229 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7230 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5]
7231 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
7232 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[2],ymm2[2]
7233 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7234 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7235 ; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm0
7236 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm8[5],ymm0[6,7]
7237 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm10[6,7]
7238 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm1
7239 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
7240 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm10 = ymm6[1,1,1,1,5,5,5,5]
7241 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7242 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0],ymm3[1],ymm10[2,3,4],ymm3[5],ymm10[6,7]
7243 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm10
7244 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
7245 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7246 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7247 ; AVX2-FCP-NEXT: vbroadcastss 916(%rdi), %ymm0
7248 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
7249 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm12[6,7]
7250 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
7251 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
7252 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
7253 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm7 = ymm10[1,1,1,1,5,5,5,5]
7254 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
7255 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0],ymm12[1],ymm7[2,3,4],ymm12[5],ymm7[6,7]
7256 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm7
7257 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
7258 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7259 ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
7260 ; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm0
7261 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm15[5],ymm0[6,7]
7262 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
7263 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm1
7264 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
7265 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm4 = ymm13[1,1,1,1,5,5,5,5]
7266 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
7267 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0],ymm7[1],ymm4[2,3,4],ymm7[5],ymm4[6,7]
7268 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4
7269 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
7270 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7271 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7272 ; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm0
7273 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
7274 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
7275 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm1
7276 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm11[1,1,1,1,5,5,5,5]
7277 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0],ymm14[1],ymm2[2,3,4],ymm14[5],ymm2[6,7]
7278 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
7279 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
7280 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7281 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7282 ; AVX2-FCP-NEXT: vbroadcastss 504(%rdi), %ymm0
7283 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
7284 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
7285 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
7286 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
7287 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm3 = ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[6],ymm3[6],ymm6[7],ymm3[7]
7288 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7289 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7290 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm6 # 32-byte Folded Reload
7291 ; AVX2-FCP-NEXT: # ymm6 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
7292 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm2
7293 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm3 = ymm6[2,2,2,2]
7294 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
7295 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5],ymm1[6,7]
7296 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7297 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7298 ; AVX2-FCP-NEXT: vbroadcastss 1016(%rdi), %ymm1
7299 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7300 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm1[7]
7301 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7302 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm3 # 32-byte Folded Reload
7303 ; AVX2-FCP-NEXT: # ymm3 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
7304 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm12 = ymm10[2],ymm12[2],ymm10[3],ymm12[3],ymm10[6],ymm12[6],ymm10[7],ymm12[7]
7305 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7306 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
7307 ; AVX2-FCP-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
7308 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm2
7309 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm4 = ymm10[2,2,2,2]
7310 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
7311 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
7312 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7313 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
7314 ; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm2
7315 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
7316 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
7317 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7318 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm4 = ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[6],ymm15[6],ymm1[7],ymm15[7]
7319 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm9 = ymm13[2],ymm7[2],ymm13[3],ymm7[3],ymm13[6],ymm7[6],ymm13[7],ymm7[7]
7320 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7321 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
7322 ; AVX2-FCP-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7323 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm8
7324 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm14[2,2,2,2]
7325 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm8[0,1],xmm15[2,3]
7326 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
7327 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm2[4,5,6,7]
7328 ; AVX2-FCP-NEXT: vbroadcastss 760(%rdi), %ymm2
7329 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
7330 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5,6],ymm2[7]
7331 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7332 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm5 = ymm1[2],ymm5[2],ymm1[3],ymm5[3],ymm1[6],ymm5[6],ymm1[7],ymm5[7]
7333 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm11 # 32-byte Folded Reload
7334 ; AVX2-FCP-NEXT: # ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7]
7335 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
7336 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm13 # 32-byte Folded Reload
7337 ; AVX2-FCP-NEXT: # ymm13 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7338 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm15
7339 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm13[2,2,2,2]
7340 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
7341 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
7342 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm2[4,5,6,7]
7343 ; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm1
7344 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7345 ; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7346 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm4[1],ymm1[1],ymm4[3],ymm1[3]
7347 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm4
7348 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
7349 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm9
7350 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm9[0,1],xmm4[2,3]
7351 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm1[4,5,6,7]
7352 ; AVX2-FCP-NEXT: vbroadcastss 476(%rdi), %ymm1
7353 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7354 ; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7355 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
7356 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm1
7357 ; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
7358 ; AVX2-FCP-NEXT: # ymm6 = mem[2,3,2,3,6,7,6,7]
7359 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
7360 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
7361 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
7362 ; AVX2-FCP-NEXT: vbroadcastss 732(%rdi), %ymm1
7363 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7364 ; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7365 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm5[1],ymm1[1],ymm5[3],ymm1[3]
7366 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm5
7367 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm11[2,3,2,3,6,7,6,7]
7368 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
7369 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
7370 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm1[4,5,6,7]
7371 ; AVX2-FCP-NEXT: vbroadcastss 988(%rdi), %ymm1
7372 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
7373 ; AVX2-FCP-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
7374 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm1 = ymm3[1],ymm1[1],ymm3[3],ymm1[3]
7375 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm3
7376 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm12[2,3,2,3,6,7,6,7]
7377 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
7378 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0,1],xmm3[2,3]
7379 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
7380 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7381 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rsi)
7382 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7383 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi)
7384 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7385 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rsi)
7386 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7387 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi)
7388 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7389 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rdx)
7390 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7391 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx)
7392 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7393 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rdx)
7394 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7395 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx)
7396 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7397 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rcx)
7398 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7399 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx)
7400 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7401 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rcx)
7402 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7403 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx)
7404 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7405 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r8)
7406 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7407 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%r8)
7408 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7409 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r8)
7410 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7411 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r8)
7412 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7413 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%r9)
7414 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7415 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%r9)
7416 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7417 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%r9)
7418 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7419 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%r9)
7420 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7421 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7422 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rax)
7423 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7424 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rax)
7425 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
7426 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rax)
7427 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
7428 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rax)
7429 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7430 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rax)
7431 ; AVX2-FCP-NEXT: vmovaps %ymm8, (%rax)
7432 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7433 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax)
7434 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
7435 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax)
7436 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7437 ; AVX2-FCP-NEXT: vmovaps %ymm1, 96(%rax)
7438 ; AVX2-FCP-NEXT: vmovaps %ymm5, 64(%rax)
7439 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
7440 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax)
7441 ; AVX2-FCP-NEXT: addq $1544, %rsp # imm = 0x608
7442 ; AVX2-FCP-NEXT: vzeroupper
7443 ; AVX2-FCP-NEXT: retq
7444 ;
7445 ; AVX512-LABEL: load_i32_stride8_vf32:
7446 ; AVX512: # %bb.0:
7447 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
7448 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
7449 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
7450 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
7451 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm29
7452 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1
7453 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30
7454 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm31
7455 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm3
7456 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm7
7457 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm6
7458 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm9
7459 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm5
7460 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm12
7461 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm2
7462 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm14
7463 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm11
7464 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm16
7465 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm15
7466 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
7467 ; AVX512-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7468 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm17
7469 ; AVX512-NEXT: vpermt2d %zmm16, %zmm13, %zmm17
7470 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm18
7471 ; AVX512-NEXT: vpermt2d %zmm14, %zmm13, %zmm18
7472 ; AVX512-NEXT: movb $-64, %dil
7473 ; AVX512-NEXT: kmovw %edi, %k1
7474 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
7475 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10
7476 ; AVX512-NEXT: vpermt2d %zmm12, %zmm13, %zmm10
7477 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm8
7478 ; AVX512-NEXT: vpermt2d %zmm9, %zmm13, %zmm8
7479 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
7480 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
7481 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8
7482 ; AVX512-NEXT: vpermt2d %zmm7, %zmm13, %zmm8
7483 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
7484 ; AVX512-NEXT: vpermt2d %zmm31, %zmm13, %zmm10
7485 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7486 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8
7487 ; AVX512-NEXT: vpermt2d %zmm30, %zmm13, %zmm8
7488 ; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm13
7489 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7490 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
7491 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
7492 ; AVX512-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7493 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm10
7494 ; AVX512-NEXT: vpermt2d %zmm16, %zmm8, %zmm10
7495 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13
7496 ; AVX512-NEXT: vpermt2d %zmm14, %zmm8, %zmm13
7497 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
7498 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm10
7499 ; AVX512-NEXT: vpermt2d %zmm12, %zmm8, %zmm10
7500 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm4
7501 ; AVX512-NEXT: vpermt2d %zmm9, %zmm8, %zmm4
7502 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
7503 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
7504 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4
7505 ; AVX512-NEXT: vpermt2d %zmm7, %zmm8, %zmm4
7506 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
7507 ; AVX512-NEXT: vpermt2d %zmm31, %zmm8, %zmm10
7508 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
7509 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4
7510 ; AVX512-NEXT: vpermt2d %zmm30, %zmm8, %zmm4
7511 ; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm8
7512 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
7513 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
7514 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
7515 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7516 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8
7517 ; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7518 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10
7519 ; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7520 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7521 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8
7522 ; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7523 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13
7524 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7525 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7526 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
7527 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8
7528 ; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
7529 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
7530 ; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
7531 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7532 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8
7533 ; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
7534 ; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7535 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7536 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
7537 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
7538 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7539 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8
7540 ; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7541 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10
7542 ; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7543 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7544 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8
7545 ; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7546 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13
7547 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7548 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7549 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
7550 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8
7551 ; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
7552 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
7553 ; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
7554 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7555 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8
7556 ; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
7557 ; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7558 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7559 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
7560 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
7561 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7562 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8
7563 ; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7564 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10
7565 ; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7566 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7567 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8
7568 ; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7569 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13
7570 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7571 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7572 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
7573 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8
7574 ; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
7575 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
7576 ; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
7577 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7578 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8
7579 ; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
7580 ; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7581 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7582 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
7583 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
7584 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7585 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8
7586 ; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7587 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10
7588 ; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7589 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7590 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8
7591 ; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7592 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13
7593 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7594 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7595 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
7596 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm8
7597 ; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
7598 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
7599 ; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
7600 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7601 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm8
7602 ; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
7603 ; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7604 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7605 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
7606 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
7607 ; AVX512-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7608 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm8
7609 ; AVX512-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7610 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm10
7611 ; AVX512-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7612 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7613 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8
7614 ; AVX512-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7615 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13
7616 ; AVX512-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7617 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7618 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7619 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10
7620 ; AVX512-NEXT: vpermt2d %zmm7, %zmm4, %zmm10
7621 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13
7622 ; AVX512-NEXT: vpermt2d %zmm31, %zmm4, %zmm13
7623 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
7624 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10
7625 ; AVX512-NEXT: vpermt2d %zmm30, %zmm4, %zmm10
7626 ; AVX512-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7627 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
7628 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
7629 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
7630 ; AVX512-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7631 ; AVX512-NEXT: vpermt2d %zmm16, %zmm10, %zmm15
7632 ; AVX512-NEXT: vpermt2d %zmm14, %zmm10, %zmm11
7633 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
7634 ; AVX512-NEXT: vpermt2d %zmm12, %zmm10, %zmm2
7635 ; AVX512-NEXT: vpermt2d %zmm9, %zmm10, %zmm5
7636 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
7637 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
7638 ; AVX512-NEXT: vpermt2d %zmm7, %zmm10, %zmm6
7639 ; AVX512-NEXT: vpermt2d %zmm31, %zmm10, %zmm3
7640 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
7641 ; AVX512-NEXT: vpermt2d %zmm30, %zmm10, %zmm1
7642 ; AVX512-NEXT: vpermt2d %zmm29, %zmm10, %zmm0
7643 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7644 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7645 ; AVX512-NEXT: vmovdqa64 %zmm28, 64(%rsi)
7646 ; AVX512-NEXT: vmovdqa64 %zmm17, (%rsi)
7647 ; AVX512-NEXT: vmovdqa64 %zmm18, 64(%rdx)
7648 ; AVX512-NEXT: vmovdqa64 %zmm19, (%rdx)
7649 ; AVX512-NEXT: vmovdqa64 %zmm20, 64(%rcx)
7650 ; AVX512-NEXT: vmovdqa64 %zmm21, (%rcx)
7651 ; AVX512-NEXT: vmovdqa64 %zmm22, 64(%r8)
7652 ; AVX512-NEXT: vmovdqa64 %zmm23, (%r8)
7653 ; AVX512-NEXT: vmovdqa64 %zmm24, 64(%r9)
7654 ; AVX512-NEXT: vmovdqa64 %zmm25, (%r9)
7655 ; AVX512-NEXT: vmovdqa64 %zmm26, 64(%r11)
7656 ; AVX512-NEXT: vmovdqa64 %zmm27, (%r11)
7657 ; AVX512-NEXT: vmovdqa64 %zmm8, 64(%r10)
7658 ; AVX512-NEXT: vmovdqa64 %zmm4, (%r10)
7659 ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax)
7660 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
7661 ; AVX512-NEXT: vzeroupper
7662 ; AVX512-NEXT: retq
7663 ;
7664 ; AVX512-FCP-LABEL: load_i32_stride8_vf32:
7665 ; AVX512-FCP: # %bb.0:
7666 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7667 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7668 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
7669 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
7670 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
7671 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
7672 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
7673 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31
7674 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
7675 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
7676 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
7677 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9
7678 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
7679 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
7680 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2
7681 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14
7682 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
7683 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16
7684 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15
7685 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
7686 ; AVX512-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7687 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
7688 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17
7689 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm18
7690 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18
7691 ; AVX512-FCP-NEXT: movb $-64, %dil
7692 ; AVX512-FCP-NEXT: kmovw %edi, %k1
7693 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
7694 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
7695 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10
7696 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
7697 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8
7698 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
7699 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
7700 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
7701 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8
7702 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
7703 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10
7704 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7705 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
7706 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8
7707 ; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13
7708 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7709 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
7710 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
7711 ; AVX512-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7712 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
7713 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10
7714 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13
7715 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13
7716 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
7717 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
7718 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10
7719 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
7720 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4
7721 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
7722 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
7723 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
7724 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4
7725 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
7726 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10
7727 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
7728 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
7729 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4
7730 ; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8
7731 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
7732 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
7733 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
7734 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7735 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
7736 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7737 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
7738 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7739 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7740 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
7741 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7742 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
7743 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7744 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7745 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
7746 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
7747 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
7748 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
7749 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
7750 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7751 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
7752 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
7753 ; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7754 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7755 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
7756 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
7757 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7758 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
7759 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7760 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
7761 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7762 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7763 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
7764 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7765 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
7766 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7767 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7768 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
7769 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
7770 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
7771 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
7772 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
7773 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7774 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
7775 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
7776 ; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7777 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7778 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
7779 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
7780 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7781 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
7782 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7783 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
7784 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7785 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7786 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
7787 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7788 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
7789 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7790 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7791 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
7792 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
7793 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
7794 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
7795 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
7796 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7797 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
7798 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
7799 ; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7800 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7801 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
7802 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
7803 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7804 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
7805 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7806 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
7807 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7808 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7809 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
7810 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7811 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
7812 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7813 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7814 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
7815 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
7816 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
7817 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
7818 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
7819 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7820 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
7821 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
7822 ; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7823 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7824 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
7825 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
7826 ; AVX512-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7827 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
7828 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7829 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
7830 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7831 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7832 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
7833 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7834 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
7835 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7836 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7837 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7838 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
7839 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10
7840 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
7841 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13
7842 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
7843 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
7844 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10
7845 ; AVX512-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7846 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
7847 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
7848 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
7849 ; AVX512-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7850 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15
7851 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11
7852 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
7853 ; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2
7854 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5
7855 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
7856 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
7857 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6
7858 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3
7859 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
7860 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1
7861 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0
7862 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7863 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7864 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi)
7865 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
7866 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx)
7867 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
7868 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx)
7869 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
7870 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
7871 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
7872 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9)
7873 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, (%r9)
7874 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11)
7875 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, (%r11)
7876 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10)
7877 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
7878 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
7879 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
7880 ; AVX512-FCP-NEXT: vzeroupper
7881 ; AVX512-FCP-NEXT: retq
7882 ;
7883 ; AVX512DQ-LABEL: load_i32_stride8_vf32:
7884 ; AVX512DQ: # %bb.0:
7885 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
7886 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
7887 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
7888 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
7889 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm29
7890 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1
7891 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm30
7892 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm31
7893 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm3
7894 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm7
7895 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm6
7896 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm9
7897 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm5
7898 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm12
7899 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm2
7900 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm14
7901 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm11
7902 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm16
7903 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm15
7904 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
7905 ; AVX512DQ-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7906 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm17
7907 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm13, %zmm17
7908 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm18
7909 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm13, %zmm18
7910 ; AVX512DQ-NEXT: movb $-64, %dil
7911 ; AVX512DQ-NEXT: kmovw %edi, %k1
7912 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
7913 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10
7914 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm13, %zmm10
7915 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm8
7916 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm13, %zmm8
7917 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
7918 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
7919 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8
7920 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm13, %zmm8
7921 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10
7922 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm13, %zmm10
7923 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7924 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8
7925 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm13, %zmm8
7926 ; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm13
7927 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7928 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
7929 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
7930 ; AVX512DQ-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7931 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm10
7932 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm8, %zmm10
7933 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm13
7934 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm8, %zmm13
7935 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
7936 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm10
7937 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm8, %zmm10
7938 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm4
7939 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm8, %zmm4
7940 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
7941 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
7942 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4
7943 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm8, %zmm4
7944 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10
7945 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm8, %zmm10
7946 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
7947 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4
7948 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm8, %zmm4
7949 ; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm8
7950 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
7951 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
7952 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
7953 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7954 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8
7955 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7956 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10
7957 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7958 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7959 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8
7960 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7961 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13
7962 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7963 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7964 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
7965 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8
7966 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
7967 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10
7968 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
7969 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7970 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8
7971 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
7972 ; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7973 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7974 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
7975 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
7976 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7977 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8
7978 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
7979 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10
7980 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
7981 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7982 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8
7983 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
7984 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13
7985 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
7986 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
7987 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
7988 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8
7989 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
7990 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10
7991 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
7992 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
7993 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8
7994 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
7995 ; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
7996 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
7997 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
7998 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
7999 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8000 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8
8001 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8002 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10
8003 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8004 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8005 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8
8006 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8007 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13
8008 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8009 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8010 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
8011 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8
8012 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8013 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10
8014 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8015 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8016 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8
8017 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8018 ; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8019 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8020 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
8021 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
8022 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8023 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8
8024 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8025 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10
8026 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8027 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8028 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8
8029 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8030 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13
8031 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8032 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8033 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
8034 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm8
8035 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8036 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10
8037 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8038 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8039 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm8
8040 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8041 ; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8042 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8043 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
8044 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
8045 ; AVX512DQ-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8046 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm8
8047 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8048 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm10
8049 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8050 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8051 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8
8052 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8053 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13
8054 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8055 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8056 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
8057 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10
8058 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm4, %zmm10
8059 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm13
8060 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm4, %zmm13
8061 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
8062 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm10
8063 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm4, %zmm10
8064 ; AVX512DQ-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8065 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8066 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
8067 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
8068 ; AVX512DQ-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8069 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm10, %zmm15
8070 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm10, %zmm11
8071 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
8072 ; AVX512DQ-NEXT: vpermt2d %zmm12, %zmm10, %zmm2
8073 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm10, %zmm5
8074 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
8075 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
8076 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm10, %zmm6
8077 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm10, %zmm3
8078 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
8079 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm10, %zmm1
8080 ; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm10, %zmm0
8081 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8082 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
8083 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, 64(%rsi)
8084 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, (%rsi)
8085 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, 64(%rdx)
8086 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, (%rdx)
8087 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, 64(%rcx)
8088 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, (%rcx)
8089 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, 64(%r8)
8090 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, (%r8)
8091 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, 64(%r9)
8092 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, (%r9)
8093 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, 64(%r11)
8094 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%r11)
8095 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%r10)
8096 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, (%r10)
8097 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax)
8098 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
8099 ; AVX512DQ-NEXT: vzeroupper
8100 ; AVX512DQ-NEXT: retq
8101 ;
8102 ; AVX512DQ-FCP-LABEL: load_i32_stride8_vf32:
8103 ; AVX512DQ-FCP: # %bb.0:
8104 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8105 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
8106 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
8107 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
8108 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
8109 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
8110 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
8111 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31
8112 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
8113 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
8114 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
8115 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9
8116 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
8117 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
8118 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2
8119 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14
8120 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
8121 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16
8122 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15
8123 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
8124 ; AVX512DQ-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8125 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
8126 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17
8127 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm18
8128 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18
8129 ; AVX512DQ-FCP-NEXT: movb $-64, %dil
8130 ; AVX512DQ-FCP-NEXT: kmovw %edi, %k1
8131 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
8132 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
8133 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10
8134 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
8135 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8
8136 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
8137 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
8138 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
8139 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8
8140 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8141 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10
8142 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8143 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
8144 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8
8145 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13
8146 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8147 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
8148 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
8149 ; AVX512DQ-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8150 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
8151 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10
8152 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13
8153 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13
8154 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
8155 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
8156 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10
8157 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
8158 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4
8159 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8160 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
8161 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
8162 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4
8163 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8164 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10
8165 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
8166 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
8167 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4
8168 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8
8169 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
8170 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
8171 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
8172 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8173 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
8174 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8175 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
8176 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8177 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8178 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
8179 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8180 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8181 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8182 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8183 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
8184 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
8185 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8186 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8187 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8188 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8189 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
8190 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8191 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8192 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8193 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
8194 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
8195 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8196 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
8197 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8198 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
8199 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8200 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8201 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
8202 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8203 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8204 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8205 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8206 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
8207 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
8208 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8209 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8210 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8211 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8212 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
8213 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8214 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8215 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8216 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
8217 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
8218 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8219 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
8220 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8221 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
8222 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8223 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8224 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
8225 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8226 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8227 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8228 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8229 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
8230 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
8231 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8232 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8233 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8234 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8235 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
8236 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8237 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8238 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8239 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
8240 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
8241 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8242 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
8243 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8244 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
8245 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8246 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8247 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
8248 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8249 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8250 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8251 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8252 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
8253 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
8254 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8255 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8256 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8257 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8258 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
8259 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8260 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8261 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8262 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
8263 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
8264 ; AVX512DQ-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8265 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
8266 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8267 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
8268 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8269 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8270 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
8271 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8272 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8273 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8274 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8275 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
8276 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
8277 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10
8278 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
8279 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13
8280 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
8281 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
8282 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10
8283 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8284 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8285 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
8286 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
8287 ; AVX512DQ-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8288 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15
8289 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11
8290 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
8291 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2
8292 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5
8293 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
8294 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
8295 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6
8296 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3
8297 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
8298 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1
8299 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0
8300 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8301 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
8302 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi)
8303 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
8304 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx)
8305 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
8306 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx)
8307 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
8308 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
8309 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
8310 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9)
8311 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, (%r9)
8312 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11)
8313 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, (%r11)
8314 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10)
8315 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
8316 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
8317 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
8318 ; AVX512DQ-FCP-NEXT: vzeroupper
8319 ; AVX512DQ-FCP-NEXT: retq
8320 ;
8321 ; AVX512BW-LABEL: load_i32_stride8_vf32:
8322 ; AVX512BW: # %bb.0:
8323 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
8324 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
8325 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
8326 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
8327 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29
8328 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
8329 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30
8330 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm31
8331 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3
8332 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7
8333 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6
8334 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9
8335 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5
8336 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12
8337 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2
8338 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14
8339 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11
8340 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm16
8341 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm15
8342 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
8343 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8344 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17
8345 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm13, %zmm17
8346 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18
8347 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm18
8348 ; AVX512BW-NEXT: movb $-64, %dil
8349 ; AVX512BW-NEXT: kmovd %edi, %k1
8350 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
8351 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
8352 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm13, %zmm10
8353 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8
8354 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm13, %zmm8
8355 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
8356 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
8357 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
8358 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8
8359 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
8360 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm13, %zmm10
8361 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8362 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
8363 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm8
8364 ; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm13
8365 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8366 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
8367 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
8368 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8369 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10
8370 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10
8371 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13
8372 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm13
8373 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
8374 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
8375 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm10
8376 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4
8377 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm8, %zmm4
8378 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8379 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
8380 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4
8381 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm4
8382 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
8383 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm10
8384 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
8385 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
8386 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm4
8387 ; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm8
8388 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
8389 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
8390 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
8391 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8392 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
8393 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8394 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
8395 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8396 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8397 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
8398 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8399 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
8400 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8401 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8402 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
8403 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
8404 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8405 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
8406 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8407 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8408 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
8409 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8410 ; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8411 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8412 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
8413 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
8414 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8415 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
8416 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8417 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
8418 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8419 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8420 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
8421 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8422 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
8423 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8424 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8425 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
8426 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
8427 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8428 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
8429 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8430 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8431 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
8432 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8433 ; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8434 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8435 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
8436 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
8437 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8438 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
8439 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8440 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
8441 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8442 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8443 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
8444 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8445 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
8446 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8447 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8448 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
8449 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
8450 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8451 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
8452 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8453 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8454 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
8455 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8456 ; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8457 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8458 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
8459 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
8460 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8461 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
8462 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8463 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
8464 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8465 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8466 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
8467 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8468 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
8469 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8470 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8471 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
8472 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
8473 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8474 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
8475 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8476 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8477 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
8478 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8479 ; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8480 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8481 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
8482 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
8483 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8484 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
8485 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8486 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
8487 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8488 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8489 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
8490 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8491 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
8492 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8493 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8494 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
8495 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10
8496 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm10
8497 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13
8498 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm13
8499 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
8500 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
8501 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm10
8502 ; AVX512BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8503 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8504 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
8505 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
8506 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8507 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm15
8508 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm10, %zmm11
8509 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
8510 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm2
8511 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm5
8512 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
8513 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
8514 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm6
8515 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm10, %zmm3
8516 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
8517 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1
8518 ; AVX512BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm0
8519 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8520 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
8521 ; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rsi)
8522 ; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rsi)
8523 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx)
8524 ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx)
8525 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rcx)
8526 ; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx)
8527 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8)
8528 ; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8)
8529 ; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%r9)
8530 ; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9)
8531 ; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r11)
8532 ; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r11)
8533 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r10)
8534 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10)
8535 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
8536 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
8537 ; AVX512BW-NEXT: vzeroupper
8538 ; AVX512BW-NEXT: retq
8539 ;
8540 ; AVX512BW-FCP-LABEL: load_i32_stride8_vf32:
8541 ; AVX512BW-FCP: # %bb.0:
8542 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8543 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
8544 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
8545 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
8546 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
8547 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
8548 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
8549 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31
8550 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
8551 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
8552 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
8553 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9
8554 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
8555 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
8556 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2
8557 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14
8558 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
8559 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16
8560 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15
8561 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
8562 ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8563 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
8564 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17
8565 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18
8566 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18
8567 ; AVX512BW-FCP-NEXT: movb $-64, %dil
8568 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
8569 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
8570 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
8571 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10
8572 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
8573 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8
8574 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
8575 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
8576 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
8577 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8
8578 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8579 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10
8580 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8581 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
8582 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8
8583 ; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13
8584 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8585 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
8586 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
8587 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8588 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
8589 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10
8590 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13
8591 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13
8592 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
8593 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
8594 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10
8595 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
8596 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4
8597 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8598 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
8599 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
8600 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4
8601 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8602 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10
8603 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
8604 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
8605 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4
8606 ; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8
8607 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
8608 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
8609 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
8610 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8611 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
8612 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8613 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
8614 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8615 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8616 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
8617 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8618 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8619 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8620 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8621 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
8622 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
8623 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8624 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8625 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8626 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8627 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
8628 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8629 ; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8630 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8631 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
8632 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
8633 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8634 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
8635 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8636 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
8637 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8638 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8639 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
8640 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8641 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8642 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8643 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8644 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
8645 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
8646 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8647 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8648 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8649 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8650 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
8651 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8652 ; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8653 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8654 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
8655 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
8656 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8657 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
8658 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8659 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
8660 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8661 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8662 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
8663 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8664 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8665 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8666 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8667 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
8668 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
8669 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8670 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8671 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8672 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8673 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
8674 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8675 ; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8676 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8677 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
8678 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
8679 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8680 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
8681 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8682 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
8683 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8684 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8685 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
8686 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8687 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8688 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8689 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8690 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
8691 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
8692 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8693 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
8694 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8695 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8696 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
8697 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8698 ; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8699 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8700 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
8701 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
8702 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8703 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
8704 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8705 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
8706 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8707 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8708 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
8709 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8710 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
8711 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8712 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8713 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
8714 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
8715 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10
8716 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
8717 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13
8718 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
8719 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
8720 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10
8721 ; AVX512BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8722 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8723 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
8724 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
8725 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8726 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15
8727 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11
8728 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
8729 ; AVX512BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2
8730 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5
8731 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
8732 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
8733 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6
8734 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3
8735 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
8736 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1
8737 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0
8738 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8739 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
8740 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi)
8741 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
8742 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx)
8743 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
8744 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx)
8745 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
8746 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
8747 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
8748 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9)
8749 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9)
8750 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11)
8751 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11)
8752 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10)
8753 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
8754 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
8755 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
8756 ; AVX512BW-FCP-NEXT: vzeroupper
8757 ; AVX512BW-FCP-NEXT: retq
8758 ;
8759 ; AVX512DQ-BW-LABEL: load_i32_stride8_vf32:
8760 ; AVX512DQ-BW: # %bb.0:
8761 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
8762 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
8763 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
8764 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
8765 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29
8766 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
8767 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30
8768 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm31
8769 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3
8770 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7
8771 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6
8772 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9
8773 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5
8774 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12
8775 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2
8776 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm14
8777 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11
8778 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm16
8779 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm15
8780 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
8781 ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8782 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17
8783 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm13, %zmm17
8784 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18
8785 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm13, %zmm18
8786 ; AVX512DQ-BW-NEXT: movb $-64, %dil
8787 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
8788 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
8789 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10
8790 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm13, %zmm10
8791 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8
8792 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm13, %zmm8
8793 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
8794 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
8795 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
8796 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm13, %zmm8
8797 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
8798 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm13, %zmm10
8799 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8800 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
8801 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm8
8802 ; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm13
8803 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8804 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
8805 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
8806 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8807 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10
8808 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm8, %zmm10
8809 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13
8810 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm8, %zmm13
8811 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
8812 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10
8813 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm8, %zmm10
8814 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4
8815 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm8, %zmm4
8816 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8817 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
8818 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4
8819 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm4
8820 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
8821 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm8, %zmm10
8822 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
8823 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4
8824 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm8, %zmm4
8825 ; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm8
8826 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
8827 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
8828 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
8829 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8830 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
8831 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8832 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
8833 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8834 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8835 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
8836 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8837 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
8838 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8839 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8840 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
8841 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
8842 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8843 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
8844 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8845 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8846 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
8847 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8848 ; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8849 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8850 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
8851 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
8852 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8853 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
8854 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8855 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
8856 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8857 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8858 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
8859 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8860 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
8861 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8862 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8863 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
8864 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
8865 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8866 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
8867 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8868 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8869 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
8870 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8871 ; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8872 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8873 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
8874 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
8875 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8876 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
8877 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8878 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
8879 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8880 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8881 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
8882 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8883 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
8884 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8885 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8886 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
8887 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
8888 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8889 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
8890 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8891 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8892 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
8893 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8894 ; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8895 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8896 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
8897 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
8898 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8899 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
8900 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8901 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
8902 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8903 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8904 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
8905 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8906 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
8907 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8908 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8909 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
8910 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
8911 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
8912 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
8913 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
8914 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8915 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
8916 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
8917 ; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8918 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
8919 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
8920 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
8921 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8922 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
8923 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
8924 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
8925 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
8926 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
8927 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
8928 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
8929 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
8930 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
8931 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
8932 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
8933 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10
8934 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm4, %zmm10
8935 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13
8936 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm4, %zmm13
8937 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
8938 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10
8939 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm4, %zmm10
8940 ; AVX512DQ-BW-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
8941 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
8942 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
8943 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
8944 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
8945 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm10, %zmm15
8946 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm10, %zmm11
8947 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
8948 ; AVX512DQ-BW-NEXT: vpermt2d %zmm12, %zmm10, %zmm2
8949 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm10, %zmm5
8950 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
8951 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
8952 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm10, %zmm6
8953 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm10, %zmm3
8954 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
8955 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm10, %zmm1
8956 ; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm10, %zmm0
8957 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8958 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
8959 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rsi)
8960 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rsi)
8961 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rdx)
8962 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx)
8963 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%rcx)
8964 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rcx)
8965 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8)
8966 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r8)
8967 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%r9)
8968 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%r9)
8969 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%r11)
8970 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, (%r11)
8971 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%r10)
8972 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r10)
8973 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
8974 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
8975 ; AVX512DQ-BW-NEXT: vzeroupper
8976 ; AVX512DQ-BW-NEXT: retq
8977 ;
8978 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf32:
8979 ; AVX512DQ-BW-FCP: # %bb.0:
8980 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
8981 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
8982 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
8983 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
8984 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
8985 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
8986 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
8987 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31
8988 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
8989 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
8990 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
8991 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9
8992 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
8993 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
8994 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2
8995 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14
8996 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
8997 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16
8998 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15
8999 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
9000 ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9001 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
9002 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm13, %zmm17
9003 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18
9004 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm13, %zmm18
9005 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil
9006 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
9007 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
9008 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
9009 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm13, %zmm10
9010 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
9011 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm13, %zmm8
9012 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
9013 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
9014 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
9015 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm13, %zmm8
9016 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
9017 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm13, %zmm10
9018 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
9019 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
9020 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm13, %zmm8
9021 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm13
9022 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
9023 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
9024 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
9025 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9026 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
9027 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm8, %zmm10
9028 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13
9029 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm8, %zmm13
9030 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
9031 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
9032 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm8, %zmm10
9033 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
9034 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm8, %zmm4
9035 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
9036 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
9037 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
9038 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm8, %zmm4
9039 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
9040 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm8, %zmm10
9041 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
9042 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
9043 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm8, %zmm4
9044 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm8
9045 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
9046 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
9047 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
9048 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9049 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
9050 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
9051 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
9052 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
9053 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
9054 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
9055 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
9056 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
9057 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
9058 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
9059 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
9060 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
9061 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
9062 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
9063 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
9064 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
9065 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
9066 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
9067 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
9068 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
9069 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
9070 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
9071 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9072 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
9073 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
9074 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
9075 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
9076 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
9077 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
9078 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
9079 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
9080 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
9081 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
9082 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
9083 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
9084 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
9085 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
9086 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
9087 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
9088 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
9089 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
9090 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
9091 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
9092 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
9093 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
9094 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9095 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
9096 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
9097 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
9098 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
9099 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
9100 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
9101 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
9102 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
9103 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
9104 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
9105 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
9106 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
9107 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
9108 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
9109 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
9110 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
9111 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
9112 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
9113 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
9114 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
9115 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
9116 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
9117 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9118 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
9119 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
9120 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
9121 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
9122 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
9123 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
9124 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
9125 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
9126 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
9127 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
9128 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
9129 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
9130 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm8
9131 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
9132 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm10
9133 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
9134 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
9135 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm8
9136 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
9137 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
9138 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
9139 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
9140 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9141 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
9142 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm4, %zmm8
9143 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
9144 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm4, %zmm10
9145 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
9146 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
9147 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm4, %zmm8
9148 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
9149 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm4, %zmm13
9150 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
9151 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
9152 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
9153 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm4, %zmm10
9154 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
9155 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm4, %zmm13
9156 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
9157 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
9158 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm4, %zmm10
9159 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm29, %zmm0, %zmm4
9160 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
9161 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
9162 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
9163 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
9164 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm10, %zmm15
9165 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm10, %zmm11
9166 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
9167 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm12, %zmm10, %zmm2
9168 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm10, %zmm5
9169 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
9170 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
9171 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm10, %zmm6
9172 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm10, %zmm3
9173 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
9174 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm10, %zmm1
9175 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm10, %zmm0
9176 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9177 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
9178 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi)
9179 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
9180 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx)
9181 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
9182 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx)
9183 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
9184 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
9185 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
9186 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9)
9187 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9)
9188 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11)
9189 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11)
9190 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10)
9191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
9192 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
9193 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
9194 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
9195 ; AVX512DQ-BW-FCP-NEXT: retq
9196 %wide.vec = load <256 x i32>, ptr %in.vec, align 64
9197 %strided.vec0 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248>
9198 %strided.vec1 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249>
9199 %strided.vec2 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250>
9200 %strided.vec3 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251>
9201 %strided.vec4 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252>
9202 %strided.vec5 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253>
9203 %strided.vec6 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254>
9204 %strided.vec7 = shufflevector <256 x i32> %wide.vec, <256 x i32> poison, <32 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255>
9205 store <32 x i32> %strided.vec0, ptr %out.vec0, align 64
9206 store <32 x i32> %strided.vec1, ptr %out.vec1, align 64
9207 store <32 x i32> %strided.vec2, ptr %out.vec2, align 64
9208 store <32 x i32> %strided.vec3, ptr %out.vec3, align 64
9209 store <32 x i32> %strided.vec4, ptr %out.vec4, align 64
9210 store <32 x i32> %strided.vec5, ptr %out.vec5, align 64
9211 store <32 x i32> %strided.vec6, ptr %out.vec6, align 64
9212 store <32 x i32> %strided.vec7, ptr %out.vec7, align 64
9213 ret void
9214 }
9216 define void @load_i32_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
9217 ; SSE-LABEL: load_i32_stride8_vf64:
9218 ; SSE: # %bb.0:
9219 ; SSE-NEXT: subq $2232, %rsp # imm = 0x8B8
9220 ; SSE-NEXT: movaps 288(%rdi), %xmm4
9221 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9222 ; SSE-NEXT: movaps 352(%rdi), %xmm5
9223 ; SSE-NEXT: movaps %xmm5, (%rsp) # 16-byte Spill
9224 ; SSE-NEXT: movaps 320(%rdi), %xmm6
9225 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9226 ; SSE-NEXT: movaps 416(%rdi), %xmm7
9227 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9228 ; SSE-NEXT: movaps 384(%rdi), %xmm8
9229 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9230 ; SSE-NEXT: movaps 480(%rdi), %xmm9
9231 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9232 ; SSE-NEXT: movaps 448(%rdi), %xmm3
9233 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9234 ; SSE-NEXT: movaps 160(%rdi), %xmm10
9235 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9236 ; SSE-NEXT: movaps 128(%rdi), %xmm1
9237 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9238 ; SSE-NEXT: movaps 224(%rdi), %xmm2
9239 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9240 ; SSE-NEXT: movaps 192(%rdi), %xmm0
9241 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9242 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9243 ; SSE-NEXT: movaps %xmm1, %xmm2
9244 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm10[0],xmm2[1],xmm10[1]
9245 ; SSE-NEXT: movaps %xmm2, %xmm1
9246 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
9247 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9248 ; SSE-NEXT: movaps %xmm3, %xmm1
9249 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
9250 ; SSE-NEXT: movaps %xmm8, %xmm3
9251 ; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1]
9252 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
9253 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9254 ; SSE-NEXT: movaps %xmm3, %xmm0
9255 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
9256 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9257 ; SSE-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
9258 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9259 ; SSE-NEXT: movaps %xmm6, %xmm0
9260 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
9261 ; SSE-NEXT: movaps 256(%rdi), %xmm2
9262 ; SSE-NEXT: movaps %xmm2, %xmm1
9263 ; SSE-NEXT: movaps %xmm2, %xmm3
9264 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
9265 ; SSE-NEXT: movaps %xmm1, %xmm2
9266 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9267 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9268 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9269 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9270 ; SSE-NEXT: movaps 736(%rdi), %xmm9
9271 ; SSE-NEXT: movaps 704(%rdi), %xmm0
9272 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9273 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9274 ; SSE-NEXT: movaps 672(%rdi), %xmm2
9275 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9276 ; SSE-NEXT: movaps 640(%rdi), %xmm1
9277 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9278 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9279 ; SSE-NEXT: movaps %xmm1, %xmm2
9280 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9281 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9282 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9283 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9284 ; SSE-NEXT: movaps 608(%rdi), %xmm2
9285 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9286 ; SSE-NEXT: movaps 576(%rdi), %xmm1
9287 ; SSE-NEXT: movaps %xmm1, %xmm0
9288 ; SSE-NEXT: movaps %xmm1, %xmm4
9289 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9290 ; SSE-NEXT: movaps 544(%rdi), %xmm15
9291 ; SSE-NEXT: movaps 512(%rdi), %xmm2
9292 ; SSE-NEXT: movaps %xmm2, %xmm1
9293 ; SSE-NEXT: movaps %xmm2, %xmm6
9294 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1]
9295 ; SSE-NEXT: movaps %xmm1, %xmm2
9296 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9297 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9298 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9299 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9300 ; SSE-NEXT: movaps 992(%rdi), %xmm1
9301 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9302 ; SSE-NEXT: movaps 960(%rdi), %xmm0
9303 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9304 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9305 ; SSE-NEXT: movaps 928(%rdi), %xmm2
9306 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9307 ; SSE-NEXT: movaps 896(%rdi), %xmm1
9308 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9309 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9310 ; SSE-NEXT: movaps %xmm1, %xmm2
9311 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9312 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9313 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9314 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9315 ; SSE-NEXT: movaps 864(%rdi), %xmm1
9316 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9317 ; SSE-NEXT: movaps 832(%rdi), %xmm0
9318 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9319 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9320 ; SSE-NEXT: movaps 800(%rdi), %xmm14
9321 ; SSE-NEXT: movaps 768(%rdi), %xmm2
9322 ; SSE-NEXT: movaps %xmm2, %xmm1
9323 ; SSE-NEXT: movaps %xmm2, %xmm8
9324 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1]
9325 ; SSE-NEXT: movaps %xmm1, %xmm2
9326 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9327 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9328 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9329 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9330 ; SSE-NEXT: movaps 1248(%rdi), %xmm1
9331 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9332 ; SSE-NEXT: movaps 1216(%rdi), %xmm0
9333 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9334 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9335 ; SSE-NEXT: movaps 1184(%rdi), %xmm2
9336 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9337 ; SSE-NEXT: movaps 1152(%rdi), %xmm1
9338 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9339 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9340 ; SSE-NEXT: movaps %xmm1, %xmm2
9341 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9342 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9343 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9344 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9345 ; SSE-NEXT: movaps 1120(%rdi), %xmm2
9346 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9347 ; SSE-NEXT: movaps 1088(%rdi), %xmm1
9348 ; SSE-NEXT: movaps %xmm1, %xmm0
9349 ; SSE-NEXT: movaps %xmm1, %xmm7
9350 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9351 ; SSE-NEXT: movaps 1056(%rdi), %xmm2
9352 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9353 ; SSE-NEXT: movaps 1024(%rdi), %xmm1
9354 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9355 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9356 ; SSE-NEXT: movaps %xmm1, %xmm2
9357 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9358 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9359 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9360 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9361 ; SSE-NEXT: movaps 1504(%rdi), %xmm1
9362 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9363 ; SSE-NEXT: movaps 1472(%rdi), %xmm0
9364 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9365 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9366 ; SSE-NEXT: movaps 1440(%rdi), %xmm2
9367 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9368 ; SSE-NEXT: movaps 1408(%rdi), %xmm1
9369 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9370 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9371 ; SSE-NEXT: movaps %xmm1, %xmm2
9372 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9373 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9374 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9375 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9376 ; SSE-NEXT: movaps 1376(%rdi), %xmm1
9377 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9378 ; SSE-NEXT: movaps 1344(%rdi), %xmm0
9379 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9380 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9381 ; SSE-NEXT: movaps 1312(%rdi), %xmm2
9382 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9383 ; SSE-NEXT: movaps 1280(%rdi), %xmm1
9384 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9385 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9386 ; SSE-NEXT: movaps %xmm1, %xmm2
9387 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9388 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9389 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9390 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9391 ; SSE-NEXT: movaps 1760(%rdi), %xmm1
9392 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9393 ; SSE-NEXT: movaps 1728(%rdi), %xmm0
9394 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9395 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9396 ; SSE-NEXT: movaps 1696(%rdi), %xmm2
9397 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9398 ; SSE-NEXT: movaps 1664(%rdi), %xmm1
9399 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9400 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9401 ; SSE-NEXT: movaps %xmm1, %xmm2
9402 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9403 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9404 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9405 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9406 ; SSE-NEXT: movaps 1632(%rdi), %xmm1
9407 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9408 ; SSE-NEXT: movaps 1600(%rdi), %xmm0
9409 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9410 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9411 ; SSE-NEXT: movaps 1568(%rdi), %xmm5
9412 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9413 ; SSE-NEXT: movaps 1536(%rdi), %xmm2
9414 ; SSE-NEXT: movaps %xmm2, %xmm1
9415 ; SSE-NEXT: movaps %xmm2, %xmm13
9416 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
9417 ; SSE-NEXT: movaps %xmm1, %xmm2
9418 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9419 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9420 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9421 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9422 ; SSE-NEXT: movaps 2016(%rdi), %xmm1
9423 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9424 ; SSE-NEXT: movaps 1984(%rdi), %xmm0
9425 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9426 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9427 ; SSE-NEXT: movaps 1952(%rdi), %xmm2
9428 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9429 ; SSE-NEXT: movaps 1920(%rdi), %xmm1
9430 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9431 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9432 ; SSE-NEXT: movaps %xmm1, %xmm2
9433 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9434 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9435 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9436 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9437 ; SSE-NEXT: movaps 1888(%rdi), %xmm1
9438 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9439 ; SSE-NEXT: movaps 1856(%rdi), %xmm0
9440 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9441 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9442 ; SSE-NEXT: movaps 1824(%rdi), %xmm2
9443 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9444 ; SSE-NEXT: movaps 1792(%rdi), %xmm1
9445 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9446 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9447 ; SSE-NEXT: movaps %xmm1, %xmm5
9448 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
9449 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9450 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9451 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9452 ; SSE-NEXT: movaps 96(%rdi), %xmm2
9453 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9454 ; SSE-NEXT: movaps 64(%rdi), %xmm12
9455 ; SSE-NEXT: movaps %xmm12, %xmm0
9456 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9457 ; SSE-NEXT: movaps (%rdi), %xmm10
9458 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9459 ; SSE-NEXT: movaps 32(%rdi), %xmm1
9460 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9461 ; SSE-NEXT: unpcklps {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
9462 ; SSE-NEXT: movaps %xmm10, %xmm5
9463 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm0[0]
9464 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9465 ; SSE-NEXT: unpckhpd {{.*#+}} xmm10 = xmm10[1],xmm0[1]
9466 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9467 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9468 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
9469 ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
9470 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9471 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
9472 ; SSE-NEXT: # xmm5 = xmm5[2],mem[2],xmm5[3],mem[3]
9473 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9474 ; SSE-NEXT: unpckhps (%rsp), %xmm10 # 16-byte Folded Reload
9475 ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
9476 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9477 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
9478 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9479 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9480 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9481 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
9482 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9483 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9484 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9485 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9486 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
9487 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9488 ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm15[2],xmm6[3],xmm15[3]
9489 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9490 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9491 ; SSE-NEXT: unpckhps {{.*#+}} xmm15 = xmm15[2],xmm9[2],xmm15[3],xmm9[3]
9492 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9493 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9494 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9495 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9496 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9497 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9498 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9499 ; SSE-NEXT: unpckhps {{.*#+}} xmm8 = xmm8[2],xmm14[2],xmm8[3],xmm14[3]
9500 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9501 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9502 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9503 ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
9504 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9505 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9506 ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
9507 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9508 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9509 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9510 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9511 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9512 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9513 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9514 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9515 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9516 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
9517 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9518 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9519 ; SSE-NEXT: # xmm6 = xmm6[2],mem[2],xmm6[3],mem[3]
9520 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9521 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9522 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9523 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9524 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9525 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9526 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9527 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9528 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9529 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9530 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9531 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9532 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9533 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9534 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
9535 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9536 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9537 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9538 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9539 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9540 ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
9541 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9542 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9543 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9544 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9545 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9546 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9547 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9548 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9549 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9550 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9551 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9552 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9553 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9554 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9555 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9556 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9557 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9558 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9559 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9560 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9561 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9562 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9563 ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
9564 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9565 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
9566 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9567 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9568 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9569 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
9570 ; SSE-NEXT: movaps %xmm5, %xmm7
9571 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0]
9572 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9573 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm11[1]
9574 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9575 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9576 ; SSE-NEXT: movaps %xmm5, %xmm7
9577 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm10[0]
9578 ; SSE-NEXT: movaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9579 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm10[1]
9580 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9581 ; SSE-NEXT: movaps %xmm0, %xmm5
9582 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
9583 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9584 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
9585 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9586 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9587 ; SSE-NEXT: movaps %xmm0, %xmm5
9588 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9589 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm3[0]
9590 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9591 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
9592 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9593 ; SSE-NEXT: movaps %xmm1, %xmm5
9594 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm15[0]
9595 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9596 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm15[1]
9597 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9598 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9599 ; SSE-NEXT: movaps %xmm0, %xmm5
9600 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9601 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
9602 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9603 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9604 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9605 ; SSE-NEXT: movaps %xmm9, %xmm5
9606 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm14[0]
9607 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9608 ; SSE-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm14[1]
9609 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9610 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9611 ; SSE-NEXT: movaps %xmm0, %xmm5
9612 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9613 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
9614 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9615 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9616 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9617 ; SSE-NEXT: movaps %xmm6, %xmm5
9618 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm8[0]
9619 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9620 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm8[1]
9621 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9622 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9623 ; SSE-NEXT: movaps %xmm0, %xmm5
9624 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9625 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
9626 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9627 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9628 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9629 ; SSE-NEXT: movaps %xmm4, %xmm5
9630 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9631 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
9632 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9633 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
9634 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9635 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9636 ; SSE-NEXT: movaps %xmm0, %xmm5
9637 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9638 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm1[0]
9639 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9640 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
9641 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9642 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9643 ; SSE-NEXT: movaps %xmm0, %xmm4
9644 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm2[0]
9645 ; SSE-NEXT: movaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9646 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
9647 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9648 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9649 ; SSE-NEXT: movaps %xmm6, %xmm0
9650 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9651 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
9652 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9653 ; SSE-NEXT: unpckhpd {{.*#+}} xmm6 = xmm6[1],xmm1[1]
9654 ; SSE-NEXT: movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9655 ; SSE-NEXT: movaps %xmm13, %xmm0
9656 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9657 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
9658 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9659 ; SSE-NEXT: unpckhpd {{.*#+}} xmm13 = xmm13[1],xmm1[1]
9660 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9661 ; SSE-NEXT: movaps %xmm12, %xmm0
9662 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9663 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
9664 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9665 ; SSE-NEXT: unpckhpd {{.*#+}} xmm12 = xmm12[1],xmm1[1]
9666 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9667 ; SSE-NEXT: movaps 240(%rdi), %xmm2
9668 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9669 ; SSE-NEXT: movaps 208(%rdi), %xmm7
9670 ; SSE-NEXT: movaps %xmm7, %xmm0
9671 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
9672 ; SSE-NEXT: movaps 176(%rdi), %xmm3
9673 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9674 ; SSE-NEXT: movaps 144(%rdi), %xmm1
9675 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9676 ; SSE-NEXT: movaps %xmm1, %xmm2
9677 ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
9678 ; SSE-NEXT: movaps %xmm2, %xmm1
9679 ; SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
9680 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9681 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
9682 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9683 ; SSE-NEXT: movaps 368(%rdi), %xmm3
9684 ; SSE-NEXT: movaps 336(%rdi), %xmm0
9685 ; SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
9686 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
9687 ; SSE-NEXT: movaps 304(%rdi), %xmm4
9688 ; SSE-NEXT: movaps 272(%rdi), %xmm1
9689 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9690 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
9691 ; SSE-NEXT: movaps %xmm1, %xmm2
9692 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9693 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9694 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9695 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9696 ; SSE-NEXT: movaps 496(%rdi), %xmm5
9697 ; SSE-NEXT: movaps 464(%rdi), %xmm0
9698 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9699 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
9700 ; SSE-NEXT: movaps 432(%rdi), %xmm6
9701 ; SSE-NEXT: movaps 400(%rdi), %xmm1
9702 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9703 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
9704 ; SSE-NEXT: movaps %xmm1, %xmm2
9705 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9706 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9707 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9708 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9709 ; SSE-NEXT: movaps 624(%rdi), %xmm9
9710 ; SSE-NEXT: movaps 592(%rdi), %xmm0
9711 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9712 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
9713 ; SSE-NEXT: movaps 560(%rdi), %xmm10
9714 ; SSE-NEXT: movaps 528(%rdi), %xmm1
9715 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9716 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm10[0],xmm1[1],xmm10[1]
9717 ; SSE-NEXT: movaps %xmm1, %xmm2
9718 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9719 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9720 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9721 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9722 ; SSE-NEXT: movaps 752(%rdi), %xmm12
9723 ; SSE-NEXT: movaps 720(%rdi), %xmm0
9724 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9725 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
9726 ; SSE-NEXT: movaps 688(%rdi), %xmm13
9727 ; SSE-NEXT: movaps 656(%rdi), %xmm2
9728 ; SSE-NEXT: movaps %xmm2, %xmm1
9729 ; SSE-NEXT: movaps %xmm2, %xmm15
9730 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
9731 ; SSE-NEXT: movaps %xmm1, %xmm2
9732 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9733 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9734 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9735 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9736 ; SSE-NEXT: movaps 880(%rdi), %xmm1
9737 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9738 ; SSE-NEXT: movaps 848(%rdi), %xmm0
9739 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9740 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9741 ; SSE-NEXT: movaps 816(%rdi), %xmm2
9742 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9743 ; SSE-NEXT: movaps 784(%rdi), %xmm1
9744 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9745 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9746 ; SSE-NEXT: movaps %xmm1, %xmm2
9747 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9748 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9749 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9750 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9751 ; SSE-NEXT: movaps 1008(%rdi), %xmm14
9752 ; SSE-NEXT: movaps 976(%rdi), %xmm0
9753 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9754 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
9755 ; SSE-NEXT: movaps 944(%rdi), %xmm2
9756 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9757 ; SSE-NEXT: movaps 912(%rdi), %xmm1
9758 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9759 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9760 ; SSE-NEXT: movaps %xmm1, %xmm2
9761 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9762 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9763 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9764 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9765 ; SSE-NEXT: movaps 1136(%rdi), %xmm1
9766 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9767 ; SSE-NEXT: movaps 1104(%rdi), %xmm0
9768 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9769 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9770 ; SSE-NEXT: movaps 1072(%rdi), %xmm2
9771 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9772 ; SSE-NEXT: movaps 1040(%rdi), %xmm1
9773 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9774 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9775 ; SSE-NEXT: movaps %xmm1, %xmm2
9776 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9777 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9778 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9779 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9780 ; SSE-NEXT: movaps 1264(%rdi), %xmm11
9781 ; SSE-NEXT: movaps 1232(%rdi), %xmm0
9782 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9783 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
9784 ; SSE-NEXT: movaps 1200(%rdi), %xmm2
9785 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9786 ; SSE-NEXT: movaps 1168(%rdi), %xmm1
9787 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9788 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9789 ; SSE-NEXT: movaps %xmm1, %xmm2
9790 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9791 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9792 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9793 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9794 ; SSE-NEXT: movaps 1392(%rdi), %xmm1
9795 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9796 ; SSE-NEXT: movaps 1360(%rdi), %xmm0
9797 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9798 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9799 ; SSE-NEXT: movaps 1328(%rdi), %xmm2
9800 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9801 ; SSE-NEXT: movaps 1296(%rdi), %xmm1
9802 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9803 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9804 ; SSE-NEXT: movaps %xmm1, %xmm2
9805 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9806 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9807 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9808 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9809 ; SSE-NEXT: movaps 1520(%rdi), %xmm1
9810 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9811 ; SSE-NEXT: movaps 1488(%rdi), %xmm0
9812 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9813 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9814 ; SSE-NEXT: movaps 1456(%rdi), %xmm2
9815 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9816 ; SSE-NEXT: movaps 1424(%rdi), %xmm1
9817 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9818 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9819 ; SSE-NEXT: movaps %xmm1, %xmm2
9820 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9821 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9822 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9823 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9824 ; SSE-NEXT: movaps 1648(%rdi), %xmm1
9825 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9826 ; SSE-NEXT: movaps 1616(%rdi), %xmm0
9827 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9828 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9829 ; SSE-NEXT: movaps 1584(%rdi), %xmm2
9830 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9831 ; SSE-NEXT: movaps 1552(%rdi), %xmm1
9832 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9833 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9834 ; SSE-NEXT: movaps %xmm1, %xmm2
9835 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9836 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9837 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9838 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9839 ; SSE-NEXT: movaps 1776(%rdi), %xmm8
9840 ; SSE-NEXT: movaps 1744(%rdi), %xmm0
9841 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9842 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
9843 ; SSE-NEXT: movaps 1712(%rdi), %xmm2
9844 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9845 ; SSE-NEXT: movaps 1680(%rdi), %xmm1
9846 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9847 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9848 ; SSE-NEXT: movaps %xmm1, %xmm2
9849 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9850 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9851 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9852 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9853 ; SSE-NEXT: movaps 1904(%rdi), %xmm1
9854 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9855 ; SSE-NEXT: movaps 1872(%rdi), %xmm0
9856 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9857 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9858 ; SSE-NEXT: movaps 1840(%rdi), %xmm2
9859 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9860 ; SSE-NEXT: movaps 1808(%rdi), %xmm1
9861 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9862 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9863 ; SSE-NEXT: movaps %xmm1, %xmm2
9864 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9865 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9866 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9867 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9868 ; SSE-NEXT: movaps 2032(%rdi), %xmm1
9869 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9870 ; SSE-NEXT: movaps 2000(%rdi), %xmm0
9871 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9872 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9873 ; SSE-NEXT: movaps 1968(%rdi), %xmm2
9874 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9875 ; SSE-NEXT: movaps 1936(%rdi), %xmm1
9876 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9877 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9878 ; SSE-NEXT: movaps %xmm1, %xmm2
9879 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9880 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9881 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9882 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9883 ; SSE-NEXT: movaps 112(%rdi), %xmm1
9884 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9885 ; SSE-NEXT: movaps 80(%rdi), %xmm0
9886 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9887 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
9888 ; SSE-NEXT: movaps 16(%rdi), %xmm1
9889 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9890 ; SSE-NEXT: movaps 48(%rdi), %xmm2
9891 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9892 ; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
9893 ; SSE-NEXT: movaps %xmm1, %xmm2
9894 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0]
9895 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9896 ; SSE-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
9897 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9898 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9899 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
9900 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9901 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9902 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
9903 ; SSE-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
9904 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
9905 ; SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
9906 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9907 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
9908 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9909 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9910 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
9911 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9912 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9913 ; SSE-NEXT: unpckhps {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
9914 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9915 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm9[2],xmm0[3],xmm9[3]
9916 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9917 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9918 ; SSE-NEXT: unpckhps {{.*#+}} xmm5 = xmm5[2],xmm10[2],xmm5[3],xmm10[3]
9919 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9920 ; SSE-NEXT: unpckhps {{.*#+}} xmm6 = xmm6[2],xmm12[2],xmm6[3],xmm12[3]
9921 ; SSE-NEXT: movaps %xmm15, %xmm4
9922 ; SSE-NEXT: unpckhps {{.*#+}} xmm4 = xmm4[2],xmm13[2],xmm4[3],xmm13[3]
9923 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9924 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9925 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
9926 ; SSE-NEXT: movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9927 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9928 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9929 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
9930 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9931 ; SSE-NEXT: unpckhps {{.*#+}} xmm3 = xmm3[2],xmm14[2],xmm3[3],xmm14[3]
9932 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9933 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9934 ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
9935 ; SSE-NEXT: movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9936 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9937 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9938 ; SSE-NEXT: # xmm14 = xmm14[2],mem[2],xmm14[3],mem[3]
9939 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9940 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
9941 ; SSE-NEXT: # xmm13 = xmm13[2],mem[2],xmm13[3],mem[3]
9942 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9943 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9944 ; SSE-NEXT: unpckhps {{.*#+}} xmm13 = xmm13[2],xmm11[2],xmm13[3],xmm11[3]
9945 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9946 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
9947 ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
9948 ; SSE-NEXT: movaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9949 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9950 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
9951 ; SSE-NEXT: # xmm11 = xmm11[2],mem[2],xmm11[3],mem[3]
9952 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9953 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9954 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
9955 ; SSE-NEXT: movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9956 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9957 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9958 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
9959 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9960 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
9961 ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
9962 ; SSE-NEXT: movaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9963 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9964 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
9965 ; SSE-NEXT: # xmm10 = xmm10[2],mem[2],xmm10[3],mem[3]
9966 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9967 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
9968 ; SSE-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
9969 ; SSE-NEXT: movaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9970 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9971 ; SSE-NEXT: unpckhps {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
9972 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9973 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9974 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
9975 ; SSE-NEXT: movaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9976 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9977 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
9978 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
9979 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9980 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9981 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9982 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9983 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9984 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9985 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9986 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9987 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9988 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9989 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9990 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9991 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9992 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9993 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9994 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9995 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9996 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9997 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9998 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9999 ; SSE-NEXT: movaps %xmm2, %xmm0
10000 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm7[0]
10001 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10002 ; SSE-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm7[1]
10003 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10004 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10005 ; SSE-NEXT: movaps %xmm0, %xmm2
10006 ; SSE-NEXT: movaps (%rsp), %xmm7 # 16-byte Reload
10007 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm7[0]
10008 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10009 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm7[1]
10010 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10011 ; SSE-NEXT: movaps %xmm1, %xmm0
10012 ; SSE-NEXT: movaps %xmm1, %xmm2
10013 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10014 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
10015 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10016 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
10017 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10018 ; SSE-NEXT: movaps %xmm5, %xmm2
10019 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10020 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
10021 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10022 ; SSE-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
10023 ; SSE-NEXT: movaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10024 ; SSE-NEXT: movaps %xmm4, %xmm2
10025 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm6[0]
10026 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10027 ; SSE-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm6[1]
10028 ; SSE-NEXT: movaps %xmm4, (%rsp) # 16-byte Spill
10029 ; SSE-NEXT: movaps %xmm15, %xmm0
10030 ; SSE-NEXT: movaps %xmm15, %xmm2
10031 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10032 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm1[0]
10033 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10034 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
10035 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10036 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10037 ; SSE-NEXT: movaps %xmm0, %xmm15
10038 ; SSE-NEXT: movlhps {{.*#+}} xmm15 = xmm15[0],xmm3[0]
10039 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm3[1]
10040 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10041 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10042 ; SSE-NEXT: movaps %xmm0, %xmm2
10043 ; SSE-NEXT: movaps %xmm14, %xmm1
10044 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm14[0]
10045 ; SSE-NEXT: movaps %xmm2, %xmm14
10046 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
10047 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10048 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10049 ; SSE-NEXT: movaps %xmm0, %xmm2
10050 ; SSE-NEXT: movaps %xmm13, %xmm1
10051 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm13[0]
10052 ; SSE-NEXT: movaps %xmm2, %xmm13
10053 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
10054 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10055 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10056 ; SSE-NEXT: movaps %xmm0, %xmm7
10057 ; SSE-NEXT: movlhps {{.*#+}} xmm7 = xmm7[0],xmm11[0]
10058 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm11[1]
10059 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10060 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10061 ; SSE-NEXT: movaps %xmm0, %xmm6
10062 ; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm12[0]
10063 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm12[1]
10064 ; SSE-NEXT: movaps %xmm0, %xmm12
10065 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10066 ; SSE-NEXT: movaps %xmm0, %xmm2
10067 ; SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm10[0]
10068 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm10[1]
10069 ; SSE-NEXT: movaps %xmm0, %xmm10
10070 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10071 ; SSE-NEXT: movaps %xmm0, %xmm5
10072 ; SSE-NEXT: movlhps {{.*#+}} xmm5 = xmm5[0],xmm9[0]
10073 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm9[1]
10074 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10075 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10076 ; SSE-NEXT: movaps %xmm0, %xmm4
10077 ; SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm8[0]
10078 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm8[1]
10079 ; SSE-NEXT: movaps %xmm0, %xmm8
10080 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10081 ; SSE-NEXT: movaps %xmm0, %xmm3
10082 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10083 ; SSE-NEXT: movlhps {{.*#+}} xmm3 = xmm3[0],xmm1[0]
10084 ; SSE-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
10085 ; SSE-NEXT: movaps %xmm0, %xmm9
10086 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10087 ; SSE-NEXT: movaps %xmm11, %xmm0
10088 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10089 ; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
10090 ; SSE-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm1[1]
10091 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10092 ; SSE-NEXT: movaps %xmm1, 224(%rsi)
10093 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10094 ; SSE-NEXT: movaps %xmm1, 160(%rsi)
10095 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10096 ; SSE-NEXT: movaps %xmm1, 96(%rsi)
10097 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10098 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
10099 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10100 ; SSE-NEXT: movaps %xmm1, 240(%rsi)
10101 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10102 ; SSE-NEXT: movaps %xmm1, 176(%rsi)
10103 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10104 ; SSE-NEXT: movaps %xmm1, 112(%rsi)
10105 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10106 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
10107 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10108 ; SSE-NEXT: movaps %xmm1, 192(%rsi)
10109 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10110 ; SSE-NEXT: movaps %xmm1, 128(%rsi)
10111 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10112 ; SSE-NEXT: movaps %xmm1, 64(%rsi)
10113 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10114 ; SSE-NEXT: movaps %xmm1, (%rsi)
10115 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10116 ; SSE-NEXT: movaps %xmm1, 208(%rsi)
10117 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10118 ; SSE-NEXT: movaps %xmm1, 144(%rsi)
10119 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10120 ; SSE-NEXT: movaps %xmm1, 80(%rsi)
10121 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10122 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
10123 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10124 ; SSE-NEXT: movaps %xmm1, 224(%rdx)
10125 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10126 ; SSE-NEXT: movaps %xmm1, 240(%rdx)
10127 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10128 ; SSE-NEXT: movaps %xmm1, 192(%rdx)
10129 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10130 ; SSE-NEXT: movaps %xmm1, 208(%rdx)
10131 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10132 ; SSE-NEXT: movaps %xmm1, 160(%rdx)
10133 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10134 ; SSE-NEXT: movaps %xmm1, 176(%rdx)
10135 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10136 ; SSE-NEXT: movaps %xmm1, 128(%rdx)
10137 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10138 ; SSE-NEXT: movaps %xmm1, 144(%rdx)
10139 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10140 ; SSE-NEXT: movaps %xmm1, 96(%rdx)
10141 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10142 ; SSE-NEXT: movaps %xmm1, 112(%rdx)
10143 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10144 ; SSE-NEXT: movaps %xmm1, 64(%rdx)
10145 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10146 ; SSE-NEXT: movaps %xmm1, 80(%rdx)
10147 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10148 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
10149 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10150 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
10151 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10152 ; SSE-NEXT: movaps %xmm1, (%rdx)
10153 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10154 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
10155 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10156 ; SSE-NEXT: movaps %xmm1, 240(%rcx)
10157 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10158 ; SSE-NEXT: movaps %xmm1, 224(%rcx)
10159 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10160 ; SSE-NEXT: movaps %xmm1, 208(%rcx)
10161 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10162 ; SSE-NEXT: movaps %xmm1, 192(%rcx)
10163 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10164 ; SSE-NEXT: movaps %xmm1, 176(%rcx)
10165 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10166 ; SSE-NEXT: movaps %xmm1, 160(%rcx)
10167 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10168 ; SSE-NEXT: movaps %xmm1, 144(%rcx)
10169 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10170 ; SSE-NEXT: movaps %xmm1, 128(%rcx)
10171 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10172 ; SSE-NEXT: movaps %xmm1, 112(%rcx)
10173 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10174 ; SSE-NEXT: movaps %xmm1, 96(%rcx)
10175 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10176 ; SSE-NEXT: movaps %xmm1, 80(%rcx)
10177 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10178 ; SSE-NEXT: movaps %xmm1, 64(%rcx)
10179 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10180 ; SSE-NEXT: movaps %xmm1, 48(%rcx)
10181 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10182 ; SSE-NEXT: movaps %xmm1, 32(%rcx)
10183 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10184 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
10185 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10186 ; SSE-NEXT: movaps %xmm1, (%rcx)
10187 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10188 ; SSE-NEXT: movaps %xmm1, 240(%r8)
10189 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10190 ; SSE-NEXT: movaps %xmm1, 224(%r8)
10191 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10192 ; SSE-NEXT: movaps %xmm1, 208(%r8)
10193 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10194 ; SSE-NEXT: movaps %xmm1, 192(%r8)
10195 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10196 ; SSE-NEXT: movaps %xmm1, 176(%r8)
10197 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10198 ; SSE-NEXT: movaps %xmm1, 160(%r8)
10199 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10200 ; SSE-NEXT: movaps %xmm1, 144(%r8)
10201 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10202 ; SSE-NEXT: movaps %xmm1, 128(%r8)
10203 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10204 ; SSE-NEXT: movaps %xmm1, 112(%r8)
10205 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10206 ; SSE-NEXT: movaps %xmm1, 96(%r8)
10207 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10208 ; SSE-NEXT: movaps %xmm1, 80(%r8)
10209 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10210 ; SSE-NEXT: movaps %xmm1, 64(%r8)
10211 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10212 ; SSE-NEXT: movaps %xmm1, 48(%r8)
10213 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10214 ; SSE-NEXT: movaps %xmm1, 32(%r8)
10215 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10216 ; SSE-NEXT: movaps %xmm1, 16(%r8)
10217 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10218 ; SSE-NEXT: movaps %xmm1, (%r8)
10219 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10220 ; SSE-NEXT: movaps %xmm1, 240(%r9)
10221 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10222 ; SSE-NEXT: movaps %xmm1, 224(%r9)
10223 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10224 ; SSE-NEXT: movaps %xmm1, 208(%r9)
10225 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10226 ; SSE-NEXT: movaps %xmm1, 192(%r9)
10227 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10228 ; SSE-NEXT: movaps %xmm1, 176(%r9)
10229 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10230 ; SSE-NEXT: movaps %xmm1, 160(%r9)
10231 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10232 ; SSE-NEXT: movaps %xmm1, 144(%r9)
10233 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10234 ; SSE-NEXT: movaps %xmm1, 128(%r9)
10235 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10236 ; SSE-NEXT: movaps %xmm1, 112(%r9)
10237 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10238 ; SSE-NEXT: movaps %xmm1, 96(%r9)
10239 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10240 ; SSE-NEXT: movaps %xmm1, 80(%r9)
10241 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10242 ; SSE-NEXT: movaps %xmm1, 64(%r9)
10243 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10244 ; SSE-NEXT: movaps %xmm1, 48(%r9)
10245 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10246 ; SSE-NEXT: movaps %xmm1, 32(%r9)
10247 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10248 ; SSE-NEXT: movaps %xmm1, 16(%r9)
10249 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10250 ; SSE-NEXT: movaps %xmm1, (%r9)
10251 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
10252 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10253 ; SSE-NEXT: movaps %xmm1, 240(%rax)
10254 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10255 ; SSE-NEXT: movaps %xmm1, 224(%rax)
10256 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10257 ; SSE-NEXT: movaps %xmm1, 208(%rax)
10258 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10259 ; SSE-NEXT: movaps %xmm1, 192(%rax)
10260 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10261 ; SSE-NEXT: movaps %xmm1, 176(%rax)
10262 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10263 ; SSE-NEXT: movaps %xmm1, 160(%rax)
10264 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10265 ; SSE-NEXT: movaps %xmm1, 144(%rax)
10266 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10267 ; SSE-NEXT: movaps %xmm1, 128(%rax)
10268 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10269 ; SSE-NEXT: movaps %xmm1, 112(%rax)
10270 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10271 ; SSE-NEXT: movaps %xmm1, 96(%rax)
10272 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10273 ; SSE-NEXT: movaps %xmm1, 80(%rax)
10274 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10275 ; SSE-NEXT: movaps %xmm1, 64(%rax)
10276 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10277 ; SSE-NEXT: movaps %xmm1, 48(%rax)
10278 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10279 ; SSE-NEXT: movaps %xmm1, 32(%rax)
10280 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10281 ; SSE-NEXT: movaps %xmm1, 16(%rax)
10282 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10283 ; SSE-NEXT: movaps %xmm1, (%rax)
10284 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
10285 ; SSE-NEXT: movaps %xmm3, 240(%rax)
10286 ; SSE-NEXT: movaps %xmm4, 224(%rax)
10287 ; SSE-NEXT: movaps %xmm5, 208(%rax)
10288 ; SSE-NEXT: movaps %xmm2, 192(%rax)
10289 ; SSE-NEXT: movaps %xmm6, 176(%rax)
10290 ; SSE-NEXT: movaps %xmm7, 160(%rax)
10291 ; SSE-NEXT: movaps %xmm13, 144(%rax)
10292 ; SSE-NEXT: movaps %xmm14, 128(%rax)
10293 ; SSE-NEXT: movaps %xmm15, 112(%rax)
10294 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10295 ; SSE-NEXT: movaps %xmm1, 96(%rax)
10296 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10297 ; SSE-NEXT: movaps %xmm1, 80(%rax)
10298 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10299 ; SSE-NEXT: movaps %xmm1, 64(%rax)
10300 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10301 ; SSE-NEXT: movaps %xmm1, 48(%rax)
10302 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10303 ; SSE-NEXT: movaps %xmm1, 32(%rax)
10304 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10305 ; SSE-NEXT: movaps %xmm1, 16(%rax)
10306 ; SSE-NEXT: movaps %xmm0, (%rax)
10307 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
10308 ; SSE-NEXT: movaps %xmm9, 240(%rax)
10309 ; SSE-NEXT: movaps %xmm8, 224(%rax)
10310 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10311 ; SSE-NEXT: movaps %xmm0, 208(%rax)
10312 ; SSE-NEXT: movaps %xmm10, 192(%rax)
10313 ; SSE-NEXT: movaps %xmm12, 176(%rax)
10314 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10315 ; SSE-NEXT: movaps %xmm0, 160(%rax)
10316 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10317 ; SSE-NEXT: movaps %xmm0, 144(%rax)
10318 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10319 ; SSE-NEXT: movaps %xmm0, 128(%rax)
10320 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10321 ; SSE-NEXT: movaps %xmm0, 112(%rax)
10322 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10323 ; SSE-NEXT: movaps %xmm0, 96(%rax)
10324 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
10325 ; SSE-NEXT: movaps %xmm0, 80(%rax)
10326 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10327 ; SSE-NEXT: movaps %xmm0, 64(%rax)
10328 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10329 ; SSE-NEXT: movaps %xmm0, 48(%rax)
10330 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10331 ; SSE-NEXT: movaps %xmm0, 32(%rax)
10332 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10333 ; SSE-NEXT: movaps %xmm0, 16(%rax)
10334 ; SSE-NEXT: movaps %xmm11, (%rax)
10335 ; SSE-NEXT: addq $2232, %rsp # imm = 0x8B8
10336 ; SSE-NEXT: retq
10337 ;
10338 ; AVX-LABEL: load_i32_stride8_vf64:
10339 ; AVX: # %bb.0:
10340 ; AVX-NEXT: subq $3720, %rsp # imm = 0xE88
10341 ; AVX-NEXT: vmovaps 288(%rdi), %xmm13
10342 ; AVX-NEXT: vmovaps 256(%rdi), %xmm2
10343 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
10344 ; AVX-NEXT: vmovaps %xmm2, %xmm15
10345 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10346 ; AVX-NEXT: vmovaps 352(%rdi), %xmm1
10347 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10348 ; AVX-NEXT: vmovaps 320(%rdi), %xmm2
10349 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10350 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10351 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10352 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
10353 ; AVX-NEXT: vmovaps 416(%rdi), %xmm1
10354 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10355 ; AVX-NEXT: vmovaps 384(%rdi), %xmm2
10356 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10357 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10358 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10359 ; AVX-NEXT: vmovaps 480(%rdi), %xmm2
10360 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10361 ; AVX-NEXT: vmovaps 448(%rdi), %xmm3
10362 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10363 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10364 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10365 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1,0,1]
10366 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10367 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
10368 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10369 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10370 ; AVX-NEXT: vmovaps 928(%rdi), %xmm1
10371 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10372 ; AVX-NEXT: vmovaps 896(%rdi), %xmm0
10373 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10374 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
10375 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10376 ; AVX-NEXT: vmovaps 992(%rdi), %xmm1
10377 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10378 ; AVX-NEXT: vmovaps 960(%rdi), %xmm2
10379 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10380 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10381 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10382 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10383 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10384 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10385 ; AVX-NEXT: vmovaps 800(%rdi), %xmm1
10386 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10387 ; AVX-NEXT: vmovaps 768(%rdi), %xmm14
10388 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
10389 ; AVX-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10390 ; AVX-NEXT: vmovaps 864(%rdi), %xmm2
10391 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10392 ; AVX-NEXT: vmovaps 832(%rdi), %xmm3
10393 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10394 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10395 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10396 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10397 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10398 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10399 ; AVX-NEXT: vmovaps 1440(%rdi), %xmm1
10400 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10401 ; AVX-NEXT: vmovaps 1408(%rdi), %xmm0
10402 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10403 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
10404 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10405 ; AVX-NEXT: vmovaps 1504(%rdi), %xmm1
10406 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10407 ; AVX-NEXT: vmovaps 1472(%rdi), %xmm2
10408 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10409 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10410 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10411 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10412 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10413 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10414 ; AVX-NEXT: vmovaps 1312(%rdi), %xmm2
10415 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10416 ; AVX-NEXT: vmovaps 1280(%rdi), %xmm1
10417 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10418 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
10419 ; AVX-NEXT: vmovaps 1376(%rdi), %xmm2
10420 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10421 ; AVX-NEXT: vmovaps 1344(%rdi), %xmm3
10422 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10423 ; AVX-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10424 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10425 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm2[0]
10426 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10427 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10428 ; AVX-NEXT: vmovaps 1952(%rdi), %xmm0
10429 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10430 ; AVX-NEXT: vmovaps 1920(%rdi), %xmm1
10431 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10432 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10433 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10434 ; AVX-NEXT: vmovaps 2016(%rdi), %xmm1
10435 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10436 ; AVX-NEXT: vmovaps 1984(%rdi), %xmm2
10437 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10438 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10439 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10440 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10441 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10442 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10443 ; AVX-NEXT: vmovaps 1824(%rdi), %xmm1
10444 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10445 ; AVX-NEXT: vmovaps 1792(%rdi), %xmm5
10446 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
10447 ; AVX-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10448 ; AVX-NEXT: vmovaps 1888(%rdi), %xmm2
10449 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10450 ; AVX-NEXT: vmovaps 1856(%rdi), %xmm3
10451 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10452 ; AVX-NEXT: vunpcklps {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10453 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm10[0]
10454 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10455 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10456 ; AVX-NEXT: vmovaps 160(%rdi), %xmm0
10457 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10458 ; AVX-NEXT: vmovaps 128(%rdi), %xmm1
10459 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10460 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10461 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10462 ; AVX-NEXT: vmovaps 224(%rdi), %xmm1
10463 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10464 ; AVX-NEXT: vmovaps 192(%rdi), %xmm2
10465 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10466 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10467 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10468 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,1,0,1]
10469 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10470 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10471 ; AVX-NEXT: vmovaps 32(%rdi), %xmm1
10472 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10473 ; AVX-NEXT: vmovaps (%rdi), %xmm4
10474 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
10475 ; AVX-NEXT: vmovaps %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10476 ; AVX-NEXT: vmovaps 96(%rdi), %xmm2
10477 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10478 ; AVX-NEXT: vmovaps 64(%rdi), %xmm3
10479 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10480 ; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10481 ; AVX-NEXT: vmovlhps {{.*#+}} xmm1 = xmm1[0],xmm3[0]
10482 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10483 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10484 ; AVX-NEXT: vmovaps 672(%rdi), %xmm1
10485 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10486 ; AVX-NEXT: vmovaps 640(%rdi), %xmm0
10487 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10488 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
10489 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10490 ; AVX-NEXT: vmovaps 736(%rdi), %xmm1
10491 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10492 ; AVX-NEXT: vmovaps 704(%rdi), %xmm2
10493 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10494 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
10495 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10496 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm1[0,1,0,1]
10497 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
10498 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm6[6,7]
10499 ; AVX-NEXT: vmovaps 544(%rdi), %xmm0
10500 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10501 ; AVX-NEXT: vmovaps 512(%rdi), %xmm2
10502 ; AVX-NEXT: vunpcklps {{.*#+}} xmm7 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
10503 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10504 ; AVX-NEXT: vmovaps 608(%rdi), %xmm0
10505 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10506 ; AVX-NEXT: vmovaps 576(%rdi), %xmm1
10507 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10508 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10509 ; AVX-NEXT: vmovlhps {{.*#+}} xmm7 = xmm7[0],xmm1[0]
10510 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7]
10511 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10512 ; AVX-NEXT: vmovaps 1184(%rdi), %xmm0
10513 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10514 ; AVX-NEXT: vmovaps 1152(%rdi), %xmm6
10515 ; AVX-NEXT: vmovaps %xmm6, (%rsp) # 16-byte Spill
10516 ; AVX-NEXT: vunpcklps {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10517 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm7
10518 ; AVX-NEXT: vmovaps 1248(%rdi), %xmm0
10519 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10520 ; AVX-NEXT: vmovaps 1216(%rdi), %xmm6
10521 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10522 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10523 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10524 ; AVX-NEXT: vshufps {{.*#+}} xmm8 = xmm0[0,1,0,1]
10525 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
10526 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm7[0,1,2,3,4,5],ymm8[6,7]
10527 ; AVX-NEXT: vmovaps 1056(%rdi), %xmm0
10528 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10529 ; AVX-NEXT: vmovaps 1024(%rdi), %xmm6
10530 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10531 ; AVX-NEXT: vunpcklps {{.*#+}} xmm9 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10532 ; AVX-NEXT: vmovaps 1120(%rdi), %xmm0
10533 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10534 ; AVX-NEXT: vmovaps 1088(%rdi), %xmm6
10535 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10536 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10537 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10538 ; AVX-NEXT: vmovlhps {{.*#+}} xmm9 = xmm9[0],xmm0[0]
10539 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3],ymm8[4,5,6,7]
10540 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10541 ; AVX-NEXT: vmovaps 1696(%rdi), %xmm0
10542 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10543 ; AVX-NEXT: vmovaps 1664(%rdi), %xmm6
10544 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10545 ; AVX-NEXT: vunpcklps {{.*#+}} xmm8 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10546 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9
10547 ; AVX-NEXT: vmovaps 1760(%rdi), %xmm0
10548 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10549 ; AVX-NEXT: vmovaps 1728(%rdi), %xmm6
10550 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10551 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
10552 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10553 ; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm0[0,1,0,1]
10554 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
10555 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5],ymm11[6,7]
10556 ; AVX-NEXT: vmovaps 1568(%rdi), %xmm12
10557 ; AVX-NEXT: vmovaps 1536(%rdi), %xmm8
10558 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm8[0],xmm12[0],xmm8[1],xmm12[1]
10559 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10560 ; AVX-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10561 ; AVX-NEXT: vmovaps 1632(%rdi), %xmm6
10562 ; AVX-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10563 ; AVX-NEXT: vmovaps 1600(%rdi), %xmm7
10564 ; AVX-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10565 ; AVX-NEXT: vunpcklps {{.*#+}} xmm9 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
10566 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm9[0]
10567 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
10568 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10569 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm15[1,1,1,1]
10570 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm13[1],xmm0[2,3]
10571 ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10572 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
10573 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload
10574 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10575 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm7[1,1,1,1]
10576 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10577 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm6[1],xmm15[2,3]
10578 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
10579 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
10580 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
10581 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10582 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm14[1,1,1,1]
10583 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10584 ; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10585 ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10586 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
10587 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload
10588 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10589 ; AVX-NEXT: # xmm15 = mem[1,1,1,1]
10590 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10591 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm14[1],xmm15[2,3]
10592 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
10593 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
10594 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
10595 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10596 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10597 ; AVX-NEXT: # xmm0 = mem[1,1,1,1]
10598 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10599 ; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10600 ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10601 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
10602 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 16-byte Folded Reload
10603 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10604 ; AVX-NEXT: # xmm15 = mem[1,1,1,1]
10605 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10606 ; AVX-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3]
10607 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
10608 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm15[0,1,2,3,4,5],ymm11[6,7]
10609 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm11[4,5,6,7]
10610 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10611 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
10612 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10613 ; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10614 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3]
10615 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 16-byte Folded Reload
10616 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10617 ; AVX-NEXT: vshufps {{.*#+}} xmm10 = xmm11[1,1,1,1]
10618 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10619 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm15[1],xmm10[2,3]
10620 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
10621 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
10622 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
10623 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10624 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm4[1,1,1,1]
10625 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10626 ; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10627 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
10628 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 16-byte Folded Reload
10629 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10630 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm10[1,1,1,1]
10631 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
10632 ; AVX-NEXT: # xmm4 = xmm4[0],mem[1],xmm4[2,3]
10633 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
10634 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
10635 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
10636 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10637 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm2[1,1,1,1]
10638 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10639 ; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10640 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
10641 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
10642 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10643 ; AVX-NEXT: # xmm2 = mem[1,1,1,1]
10644 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
10645 ; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
10646 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10647 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10648 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10649 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10650 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10651 ; AVX-NEXT: # xmm0 = mem[1,1,1,1]
10652 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10653 ; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
10654 ; AVX-NEXT: vblendps $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10655 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
10656 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
10657 ; AVX-NEXT: vpermilps $85, (%rsp), %xmm2 # 16-byte Folded Reload
10658 ; AVX-NEXT: # xmm2 = mem[1,1,1,1]
10659 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
10660 ; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
10661 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10662 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10663 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10664 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10665 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm8[1,1,1,1]
10666 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
10667 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
10668 ; AVX-NEXT: vinsertf128 $1, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 16-byte Folded Reload
10669 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10670 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm9[1,1,1,1]
10671 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
10672 ; AVX-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
10673 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10674 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10675 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10676 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10677 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10678 ; AVX-NEXT: vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm13[2],xmm0[3],xmm13[3]
10679 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10680 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
10681 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10682 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10683 ; AVX-NEXT: # xmm0 = mem[2,2,2,2]
10684 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10685 ; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
10686 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10687 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10688 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10689 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10690 ; AVX-NEXT: # xmm1 = mem[2,2,2,2]
10691 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10692 ; AVX-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
10693 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
10694 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10695 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10696 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10697 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10698 ; AVX-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
10699 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10700 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10701 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
10702 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10703 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10704 ; AVX-NEXT: # xmm0 = mem[2,2,2,2]
10705 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10706 ; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
10707 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10708 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10709 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10710 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10711 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm7[2,2,2,2]
10712 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10713 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
10714 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
10715 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10716 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10717 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10718 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10719 ; AVX-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
10720 ; AVX-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10721 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10722 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
10723 ; AVX-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
10724 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10725 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10726 ; AVX-NEXT: # xmm0 = mem[2,2,2,2]
10727 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10728 ; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
10729 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10730 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10731 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10732 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10733 ; AVX-NEXT: # xmm1 = mem[2,2,2,2]
10734 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10735 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3]
10736 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
10737 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10738 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10739 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10740 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10741 ; AVX-NEXT: # xmm3 = xmm0[2],mem[2],xmm0[3],mem[3]
10742 ; AVX-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10743 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
10744 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10745 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10746 ; AVX-NEXT: # xmm0 = mem[2,2,2,2]
10747 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10748 ; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
10749 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10750 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10751 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10752 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10753 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm5[2,2,2,2]
10754 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10755 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
10756 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
10757 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10758 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10759 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10760 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
10761 ; AVX-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
10762 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10763 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
10764 ; AVX-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3]
10765 ; AVX-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10766 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10767 ; AVX-NEXT: # xmm0 = mem[2,2,2,2]
10768 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10769 ; AVX-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
10770 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
10771 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10772 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
10773 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10774 ; AVX-NEXT: # xmm1 = mem[2,2,2,2]
10775 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10776 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
10777 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
10778 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10779 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10780 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10781 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
10782 ; AVX-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
10783 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10784 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10785 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10786 ; AVX-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
10787 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10788 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10789 ; AVX-NEXT: # xmm15 = mem[2,2,2,2]
10790 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
10791 ; AVX-NEXT: # xmm15 = mem[0,1,2],xmm15[3]
10792 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
10793 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm14
10794 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
10795 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
10796 ; AVX-NEXT: # xmm15 = mem[2,2,2,2]
10797 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10798 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
10799 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1],xmm15[2,3]
10800 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm15[0,1,2,3],ymm14[4,5,6,7]
10801 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10802 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10803 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload
10804 ; AVX-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3]
10805 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10806 ; AVX-NEXT: vmovaps (%rsp), %xmm8 # 16-byte Reload
10807 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm8 # 16-byte Folded Reload
10808 ; AVX-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
10809 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10810 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
10811 ; AVX-NEXT: # xmm13 = mem[2,2,2,2]
10812 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
10813 ; AVX-NEXT: # xmm13 = mem[0,1,2],xmm13[3]
10814 ; AVX-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
10815 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm12
10816 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm13[6,7]
10817 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10818 ; AVX-NEXT: vshufps {{.*#+}} xmm13 = xmm15[2,2,2,2]
10819 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10820 ; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1,2],xmm13[3]
10821 ; AVX-NEXT: vblendps {{.*#+}} xmm13 = xmm0[0,1],xmm13[2,3]
10822 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm12[4,5,6,7]
10823 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10824 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10825 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm0 # 16-byte Folded Reload
10826 ; AVX-NEXT: # xmm0 = xmm8[2],mem[2],xmm8[3],mem[3]
10827 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
10828 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm8 # 16-byte Folded Reload
10829 ; AVX-NEXT: # xmm8 = xmm9[2],mem[2],xmm9[3],mem[3]
10830 ; AVX-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10831 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
10832 ; AVX-NEXT: # xmm11 = mem[2,2,2,2]
10833 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10834 ; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm13[0,1,2],xmm11[3]
10835 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
10836 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm9
10837 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7]
10838 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10839 ; AVX-NEXT: vshufps {{.*#+}} xmm11 = xmm12[2,2,2,2]
10840 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10841 ; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm10[0,1,2],xmm11[3]
10842 ; AVX-NEXT: vblendps {{.*#+}} xmm11 = xmm0[0,1],xmm11[2,3]
10843 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm11[0,1,2,3],ymm9[4,5,6,7]
10844 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10845 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10846 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm9 # 16-byte Folded Reload
10847 ; AVX-NEXT: # xmm9 = xmm8[2],mem[2],xmm8[3],mem[3]
10848 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10849 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm8[1],xmm9[1]
10850 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10851 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8, %xmm11 # 16-byte Folded Reload
10852 ; AVX-NEXT: # xmm11 = xmm8[2],mem[2],xmm8[3],mem[3]
10853 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
10854 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
10855 ; AVX-NEXT: # xmm8 = mem[2,3,2,3]
10856 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
10857 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm11[6,7]
10858 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
10859 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10860 ; AVX-NEXT: vunpckhps {{.*#+}} xmm8 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
10861 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10862 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm6[1],xmm8[1]
10863 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10864 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm8 # 16-byte Folded Reload
10865 ; AVX-NEXT: # xmm8 = xmm6[2],mem[2],xmm6[3],mem[3]
10866 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
10867 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
10868 ; AVX-NEXT: # xmm9 = mem[2,3,2,3]
10869 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
10870 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
10871 ; AVX-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
10872 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10873 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm7 # 16-byte Folded Reload
10874 ; AVX-NEXT: # xmm7 = xmm4[2],mem[2],xmm4[3],mem[3]
10875 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10876 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm4[1],xmm7[1]
10877 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10878 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm8 # 16-byte Folded Reload
10879 ; AVX-NEXT: # xmm8 = xmm4[2],mem[2],xmm4[3],mem[3]
10880 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
10881 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
10882 ; AVX-NEXT: # xmm6 = mem[2,3,2,3]
10883 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
10884 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm8[6,7]
10885 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
10886 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10887 ; AVX-NEXT: vunpckhps {{.*#+}} xmm6 = xmm2[2],xmm5[2],xmm2[3],xmm5[3]
10888 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10889 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm4 = xmm2[1],xmm6[1]
10890 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10891 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm6 # 16-byte Folded Reload
10892 ; AVX-NEXT: # xmm6 = xmm2[2],mem[2],xmm2[3],mem[3]
10893 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
10894 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10895 ; AVX-NEXT: # xmm5 = mem[2,3,2,3]
10896 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
10897 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
10898 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10899 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10900 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload
10901 ; AVX-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3]
10902 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
10903 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm2 = xmm2[1],xmm4[1]
10904 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10905 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm4 # 16-byte Folded Reload
10906 ; AVX-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3]
10907 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
10908 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10909 ; AVX-NEXT: # xmm3 = mem[2,3,2,3]
10910 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
10911 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
10912 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
10913 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10914 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
10915 ; AVX-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3]
10916 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10917 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm2[1]
10918 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10919 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
10920 ; AVX-NEXT: # xmm2 = xmm1[2],mem[2],xmm1[3],mem[3]
10921 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10922 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10923 ; AVX-NEXT: # xmm1 = mem[2,3,2,3]
10924 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10925 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
10926 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10927 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10928 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm15[2],xmm14[3],xmm15[3]
10929 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10930 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
10931 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10932 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10933 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
10934 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10935 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10936 ; AVX-NEXT: # xmm2 = mem[2,3,2,3]
10937 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10938 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10939 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10940 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10941 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm10[2],xmm12[2],xmm10[3],xmm12[3]
10942 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
10943 ; AVX-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1]
10944 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload
10945 ; AVX-NEXT: # xmm1 = xmm13[2],mem[2],xmm13[3],mem[3]
10946 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
10947 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10948 ; AVX-NEXT: # xmm2 = mem[2,3,2,3]
10949 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
10950 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10951 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10952 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10953 ; AVX-NEXT: vmovaps 416(%rdi), %ymm2
10954 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10955 ; AVX-NEXT: vmovaps 384(%rdi), %ymm3
10956 ; AVX-NEXT: vmovaps 448(%rdi), %ymm1
10957 ; AVX-NEXT: vmovaps 480(%rdi), %ymm0
10958 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10959 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10960 ; AVX-NEXT: vmovaps %ymm1, %ymm4
10961 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
10962 ; AVX-NEXT: vmovaps %ymm3, %ymm15
10963 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
10964 ; AVX-NEXT: vmovaps 288(%rdi), %ymm6
10965 ; AVX-NEXT: vmovaps 256(%rdi), %ymm1
10966 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10967 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm6[0],ymm1[1],ymm6[1],ymm1[4],ymm6[4],ymm1[5],ymm6[5]
10968 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10969 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
10970 ; AVX-NEXT: vmovaps 320(%rdi), %ymm2
10971 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10972 ; AVX-NEXT: vmovaps 352(%rdi), %ymm5
10973 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm5[0],ymm2[0],ymm5[2],ymm2[2]
10974 ; AVX-NEXT: vmovaps %ymm5, %ymm9
10975 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
10976 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
10977 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
10978 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
10979 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10980 ; AVX-NEXT: vmovaps 672(%rdi), %ymm12
10981 ; AVX-NEXT: vmovaps 640(%rdi), %ymm8
10982 ; AVX-NEXT: vmovaps 704(%rdi), %ymm1
10983 ; AVX-NEXT: vmovaps 736(%rdi), %ymm0
10984 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10985 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
10986 ; AVX-NEXT: vmovaps %ymm1, %ymm13
10987 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10988 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm8[0],ymm12[0],ymm8[1],ymm12[1],ymm8[4],ymm12[4],ymm8[5],ymm12[5]
10989 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
10990 ; AVX-NEXT: vmovaps 544(%rdi), %ymm1
10991 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10992 ; AVX-NEXT: vmovaps 512(%rdi), %ymm2
10993 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10994 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
10995 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
10996 ; AVX-NEXT: vmovaps 576(%rdi), %ymm11
10997 ; AVX-NEXT: vmovaps 608(%rdi), %ymm7
10998 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm7[0],ymm11[0],ymm7[2],ymm11[2]
10999 ; AVX-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11000 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
11001 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
11002 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
11003 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11004 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11005 ; AVX-NEXT: vmovaps 928(%rdi), %ymm2
11006 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11007 ; AVX-NEXT: vmovaps 896(%rdi), %ymm3
11008 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11009 ; AVX-NEXT: vmovaps 960(%rdi), %ymm1
11010 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11011 ; AVX-NEXT: vmovaps 992(%rdi), %ymm0
11012 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11013 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11014 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
11015 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
11016 ; AVX-NEXT: vmovaps 800(%rdi), %ymm2
11017 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11018 ; AVX-NEXT: vmovaps 768(%rdi), %ymm1
11019 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11020 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
11021 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
11022 ; AVX-NEXT: vmovaps 832(%rdi), %ymm3
11023 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11024 ; AVX-NEXT: vmovaps 864(%rdi), %ymm2
11025 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11026 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
11027 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
11028 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
11029 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
11030 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11031 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11032 ; AVX-NEXT: vmovaps 1184(%rdi), %ymm2
11033 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11034 ; AVX-NEXT: vmovaps 1152(%rdi), %ymm1
11035 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11036 ; AVX-NEXT: vmovaps 1216(%rdi), %ymm0
11037 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11038 ; AVX-NEXT: vmovaps 1248(%rdi), %ymm3
11039 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11040 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[2],ymm0[2]
11041 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
11042 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
11043 ; AVX-NEXT: vmovaps 1056(%rdi), %ymm1
11044 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11045 ; AVX-NEXT: vmovaps 1024(%rdi), %ymm2
11046 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11047 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
11048 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
11049 ; AVX-NEXT: vmovaps 1088(%rdi), %ymm3
11050 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11051 ; AVX-NEXT: vmovaps 1120(%rdi), %ymm2
11052 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11053 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
11054 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
11055 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
11056 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
11057 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11058 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11059 ; AVX-NEXT: vmovaps 1440(%rdi), %ymm2
11060 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11061 ; AVX-NEXT: vmovaps 1408(%rdi), %ymm3
11062 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11063 ; AVX-NEXT: vmovaps 1472(%rdi), %ymm1
11064 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11065 ; AVX-NEXT: vmovaps 1504(%rdi), %ymm0
11066 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11067 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11068 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
11069 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
11070 ; AVX-NEXT: vmovaps 1312(%rdi), %ymm2
11071 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11072 ; AVX-NEXT: vmovaps 1280(%rdi), %ymm1
11073 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11074 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
11075 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
11076 ; AVX-NEXT: vmovaps 1344(%rdi), %ymm3
11077 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11078 ; AVX-NEXT: vmovaps 1376(%rdi), %ymm2
11079 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11080 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
11081 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
11082 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
11083 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
11084 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11085 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11086 ; AVX-NEXT: vmovaps 1696(%rdi), %ymm2
11087 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11088 ; AVX-NEXT: vmovaps 1664(%rdi), %ymm3
11089 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11090 ; AVX-NEXT: vmovaps 1728(%rdi), %ymm1
11091 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11092 ; AVX-NEXT: vmovaps 1760(%rdi), %ymm0
11093 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11094 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11095 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
11096 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
11097 ; AVX-NEXT: vmovaps 1568(%rdi), %ymm2
11098 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11099 ; AVX-NEXT: vmovaps 1536(%rdi), %ymm1
11100 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11101 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
11102 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
11103 ; AVX-NEXT: vmovaps 1600(%rdi), %ymm3
11104 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11105 ; AVX-NEXT: vmovaps 1632(%rdi), %ymm2
11106 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11107 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
11108 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
11109 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
11110 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
11111 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11112 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11113 ; AVX-NEXT: vmovaps 1952(%rdi), %ymm2
11114 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11115 ; AVX-NEXT: vmovaps 1920(%rdi), %ymm3
11116 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11117 ; AVX-NEXT: vmovaps 1984(%rdi), %ymm1
11118 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11119 ; AVX-NEXT: vmovaps 2016(%rdi), %ymm0
11120 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11121 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11122 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
11123 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
11124 ; AVX-NEXT: vmovaps 1824(%rdi), %ymm2
11125 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11126 ; AVX-NEXT: vmovaps 1792(%rdi), %ymm1
11127 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11128 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
11129 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
11130 ; AVX-NEXT: vmovaps 1856(%rdi), %ymm3
11131 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11132 ; AVX-NEXT: vmovaps 1888(%rdi), %ymm2
11133 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11134 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[2],ymm3[2]
11135 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
11136 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
11137 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
11138 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11139 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11140 ; AVX-NEXT: vmovaps 160(%rdi), %ymm2
11141 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11142 ; AVX-NEXT: vmovaps 128(%rdi), %ymm3
11143 ; AVX-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11144 ; AVX-NEXT: vmovaps 192(%rdi), %ymm1
11145 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11146 ; AVX-NEXT: vmovaps 224(%rdi), %ymm0
11147 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
11148 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11149 ; AVX-NEXT: vunpcklps {{.*#+}} ymm1 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
11150 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm1[0,1],ymm0[2,0],ymm1[4,5],ymm0[6,4]
11151 ; AVX-NEXT: vmovaps 64(%rdi), %ymm1
11152 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11153 ; AVX-NEXT: vmovaps 96(%rdi), %ymm0
11154 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11155 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
11156 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,2,0,4,5,6,4]
11157 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
11158 ; AVX-NEXT: vmovaps (%rdi), %ymm1
11159 ; AVX-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11160 ; AVX-NEXT: vmovaps 32(%rdi), %ymm2
11161 ; AVX-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11162 ; AVX-NEXT: vunpcklps {{.*#+}} ymm14 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[4],ymm2[4],ymm1[5],ymm2[5]
11163 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11164 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm14[0,1],xmm0[2,3]
11165 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7]
11166 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11167 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11168 ; AVX-NEXT: vmovaps %ymm4, %ymm5
11169 ; AVX-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11170 ; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[4],ymm3[4],ymm4[5],ymm3[5]
11171 ; AVX-NEXT: vmovaps %ymm15, %ymm4
11172 ; AVX-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11173 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11174 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm1[1,0],ymm15[1,0],ymm1[5,4],ymm15[5,4]
11175 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11176 ; AVX-NEXT: vmovaps %ymm9, %ymm15
11177 ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11178 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11179 ; AVX-NEXT: vunpcklps {{.*#+}} ymm10 = ymm9[0],ymm15[0],ymm9[1],ymm15[1],ymm9[4],ymm15[4],ymm9[5],ymm15[5]
11180 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11181 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11182 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm6[1,0],ymm2[1,0],ymm6[5,4],ymm2[5,4]
11183 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11184 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11185 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11186 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11187 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11188 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11189 ; AVX-NEXT: vunpcklps {{.*#+}} ymm0 = ymm13[0],ymm6[0],ymm13[1],ymm6[1],ymm13[4],ymm6[4],ymm13[5],ymm6[5]
11190 ; AVX-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11191 ; AVX-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11192 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm12[1,0],ymm8[1,0],ymm12[5,4],ymm8[5,4]
11193 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11194 ; AVX-NEXT: vmovaps %ymm11, %ymm13
11195 ; AVX-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11196 ; AVX-NEXT: vunpcklps {{.*#+}} ymm10 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5]
11197 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11198 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11199 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11200 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm7[1,0],ymm11[1,0],ymm7[5,4],ymm11[5,4]
11201 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11202 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11203 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11204 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11205 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11206 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11207 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11208 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11209 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11210 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11211 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[1,0],ymm7[1,0],ymm10[5,4],ymm7[5,4]
11212 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11213 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11214 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11215 ; AVX-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5]
11216 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11217 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11218 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11219 ; AVX-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4]
11220 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11221 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11222 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11223 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11224 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11225 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11226 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11227 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11228 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11229 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11230 ; AVX-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4]
11231 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11232 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11233 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11234 ; AVX-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5]
11235 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11236 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11237 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11238 ; AVX-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4]
11239 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11240 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11241 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11242 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11243 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11244 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11245 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11246 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11247 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11248 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11249 ; AVX-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4]
11250 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11251 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11252 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11253 ; AVX-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5]
11254 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11255 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11256 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11257 ; AVX-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4]
11258 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11259 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11260 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11261 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11262 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11263 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11264 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11265 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11266 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11267 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11268 ; AVX-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4]
11269 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11270 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11271 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11272 ; AVX-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5]
11273 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11274 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11275 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11276 ; AVX-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4]
11277 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11278 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11279 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11280 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11281 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11282 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11283 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11284 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11285 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11286 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11287 ; AVX-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4]
11288 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11289 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11290 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11291 ; AVX-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5]
11292 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11293 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11294 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11295 ; AVX-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4]
11296 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11297 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11298 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11299 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11300 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11301 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11302 ; AVX-NEXT: vunpcklps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
11303 ; AVX-NEXT: # ymm0 = ymm0[0],mem[0],ymm0[1],mem[1],ymm0[4],mem[4],ymm0[5],mem[5]
11304 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11305 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11306 ; AVX-NEXT: # ymm10 = ymm10[1,0],mem[1,0],ymm10[5,4],mem[5,4]
11307 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11308 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11309 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11310 ; AVX-NEXT: # ymm10 = ymm10[0],mem[0],ymm10[1],mem[1],ymm10[4],mem[4],ymm10[5],mem[5]
11311 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11312 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11313 ; AVX-NEXT: vshufps $17, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11314 ; AVX-NEXT: # ymm14 = ymm14[1,0],mem[1,0],ymm14[5,4],mem[5,4]
11315 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11316 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11317 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11318 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11319 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11320 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm3[1],ymm5[1],ymm3[3],ymm5[3]
11321 ; AVX-NEXT: vunpckhps {{.*#+}} ymm10 = ymm4[2],ymm1[2],ymm4[3],ymm1[3],ymm4[6],ymm1[6],ymm4[7],ymm1[7]
11322 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4]
11323 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm10 # 32-byte Folded Reload
11324 ; AVX-NEXT: # ymm10 = ymm2[2],mem[2],ymm2[3],mem[3],ymm2[6],mem[6],ymm2[7],mem[7]
11325 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11326 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm15[1],ymm9[1],ymm15[3],ymm9[3]
11327 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
11328 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11329 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
11330 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11331 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11332 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm0 # 32-byte Folded Reload
11333 ; AVX-NEXT: # ymm0 = ymm6[1],mem[1],ymm6[3],mem[3]
11334 ; AVX-NEXT: vunpckhps {{.*#+}} ymm10 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7]
11335 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4]
11336 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
11337 ; AVX-NEXT: vunpckhps {{.*#+}} ymm10 = ymm11[2],ymm12[2],ymm11[3],ymm12[3],ymm11[6],ymm12[6],ymm11[7],ymm12[7]
11338 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11339 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11340 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm1[1],ymm13[1],ymm1[3],ymm13[3]
11341 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
11342 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11343 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
11344 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11345 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11346 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
11347 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11348 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm4[1],ymm6[3],ymm4[3]
11349 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
11350 ; AVX-NEXT: vunpckhps {{.*#+}} ymm10 = ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[6],ymm5[6],ymm7[7],ymm5[7]
11351 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4]
11352 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
11353 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
11354 ; AVX-NEXT: vunpckhps {{.*#+}} ymm10 = ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[6],ymm11[6],ymm13[7],ymm11[7]
11355 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11356 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
11357 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11358 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm14 = ymm3[1],ymm2[1],ymm3[3],ymm2[3]
11359 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
11360 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11361 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
11362 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11363 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11364 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
11365 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm0 # 32-byte Folded Reload
11366 ; AVX-NEXT: # ymm0 = ymm8[1],mem[1],ymm8[3],mem[3]
11367 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
11368 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
11369 ; AVX-NEXT: vunpckhps {{.*#+}} ymm10 = ymm9[2],ymm7[2],ymm9[3],ymm7[3],ymm9[6],ymm7[6],ymm9[7],ymm7[7]
11370 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4]
11371 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
11372 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm10 # 32-byte Folded Reload
11373 ; AVX-NEXT: # ymm10 = ymm15[2],mem[2],ymm15[3],mem[3],ymm15[6],mem[6],ymm15[7],mem[7]
11374 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11375 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11376 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11377 ; AVX-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3]
11378 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
11379 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11380 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
11381 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11382 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11383 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11384 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11385 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
11386 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11387 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11388 ; AVX-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
11389 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4]
11390 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11391 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11392 ; AVX-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
11393 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11394 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11395 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11396 ; AVX-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3]
11397 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
11398 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11399 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
11400 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11401 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11402 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11403 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11404 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
11405 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11406 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11407 ; AVX-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
11408 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4]
11409 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11410 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11411 ; AVX-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
11412 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11413 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11414 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11415 ; AVX-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3]
11416 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
11417 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11418 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
11419 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11420 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11421 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11422 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11423 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
11424 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11425 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11426 ; AVX-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
11427 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4]
11428 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11429 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11430 ; AVX-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
11431 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11432 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11433 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11434 ; AVX-NEXT: # ymm14 = ymm14[1],mem[1],ymm14[3],mem[3]
11435 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[0,1,2,0,4,5,6,4]
11436 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11437 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0,1],xmm14[2,3]
11438 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11439 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11440 ; AVX-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
11441 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11442 ; AVX-NEXT: # ymm0 = ymm0[1],mem[1],ymm0[3],mem[3]
11443 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11444 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11445 ; AVX-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
11446 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[0,1],ymm0[2,0],ymm10[4,5],ymm0[6,4]
11447 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11448 ; AVX-NEXT: vunpckhpd {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11449 ; AVX-NEXT: # ymm10 = ymm10[1],mem[1],ymm10[3],mem[3]
11450 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm10[0,1,2,0,4,5,6,4]
11451 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11452 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11453 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11454 ; AVX-NEXT: # ymm14 = ymm14[2],mem[2],ymm14[3],mem[3],ymm14[6],mem[6],ymm14[7],mem[7]
11455 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11456 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11457 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11458 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11459 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11460 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11461 ; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11462 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11463 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11464 ; AVX-NEXT: # ymm10 = ymm10[3,0],mem[3,0],ymm10[7,4],mem[7,4]
11465 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11466 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11467 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11468 ; AVX-NEXT: # ymm10 = ymm10[2],mem[2],ymm10[3],mem[3],ymm10[6],mem[6],ymm10[7],mem[7]
11469 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
11470 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm14 # 32-byte Folded Reload
11471 ; AVX-NEXT: # ymm14 = ymm14[3,0],mem[3,0],ymm14[7,4],mem[7,4]
11472 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11473 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11474 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11475 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11476 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11477 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11478 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11479 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11480 ; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11481 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11482 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10, %ymm10 # 32-byte Folded Reload
11483 ; AVX-NEXT: # ymm10 = ymm10[3,0],mem[3,0],ymm10[7,4],mem[7,4]
11484 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11485 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
11486 ; AVX-NEXT: vunpckhps {{.*#+}} ymm10 = ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[6],ymm1[6],ymm10[7],ymm1[7]
11487 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12, %ymm14 # 32-byte Folded Reload
11488 ; AVX-NEXT: # ymm14 = ymm12[3,0],mem[3,0],ymm12[7,4],mem[7,4]
11489 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11490 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11491 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11492 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11493 ; AVX-NEXT: vblendps {{.*#+}} ymm12 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11494 ; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm4[2],ymm6[2],ymm4[3],ymm6[3],ymm4[6],ymm6[6],ymm4[7],ymm6[7]
11495 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm10 # 32-byte Folded Reload
11496 ; AVX-NEXT: # ymm10 = ymm5[3,0],mem[3,0],ymm5[7,4],mem[7,4]
11497 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11498 ; AVX-NEXT: vunpckhps {{.*#+}} ymm10 = ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[6],ymm3[6],ymm2[7],ymm3[7]
11499 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm11[3,0],ymm13[3,0],ymm11[7,4],ymm13[7,4]
11500 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11501 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11502 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11503 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11504 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11505 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11506 ; AVX-NEXT: vunpckhps {{.*#+}} ymm0 = ymm0[2],ymm8[2],ymm0[3],ymm8[3],ymm0[6],ymm8[6],ymm0[7],ymm8[7]
11507 ; AVX-NEXT: vshufps {{.*#+}} ymm10 = ymm7[3,0],ymm9[3,0],ymm7[7,4],ymm9[7,4]
11508 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11509 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11510 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
11511 ; AVX-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11512 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11513 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm1[3,0],ymm15[3,0],ymm1[7,4],ymm15[7,4]
11514 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11515 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11516 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11517 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11518 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11519 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11520 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11521 ; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11522 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11523 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
11524 ; AVX-NEXT: # ymm10 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11525 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm10[2,0],ymm0[2,3],ymm10[6,4],ymm0[6,7]
11526 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11527 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload
11528 ; AVX-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11529 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11530 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
11531 ; AVX-NEXT: # ymm14 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11532 ; AVX-NEXT: vextractf128 $1, %ymm10, %xmm10
11533 ; AVX-NEXT: vshufps {{.*#+}} ymm14 = ymm14[2,0,2,3,6,4,6,7]
11534 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11535 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm14[0,1],xmm10[2,3]
11536 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm0[4,5,6,7]
11537 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11538 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11539 ; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11540 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11541 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
11542 ; AVX-NEXT: # ymm14 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11543 ; AVX-NEXT: vshufps {{.*#+}} ymm1 = ymm14[2,0],ymm0[2,3],ymm14[6,4],ymm0[6,7]
11544 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11545 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm14 # 32-byte Folded Reload
11546 ; AVX-NEXT: # ymm14 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11547 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11548 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11549 ; AVX-NEXT: # ymm0 = ymm0[3,0],mem[3,0],ymm0[7,4],mem[7,4]
11550 ; AVX-NEXT: vextractf128 $1, %ymm14, %xmm14
11551 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,0,2,3,6,4,6,7]
11552 ; AVX-NEXT: vextractf128 $1, %ymm0, %xmm0
11553 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3]
11554 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11555 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11556 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
11557 ; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11558 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11559 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11560 ; AVX-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11561 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
11562 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11563 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11564 ; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11565 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11566 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm5 # 32-byte Folded Reload
11567 ; AVX-NEXT: # ymm5 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
11568 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
11569 ; AVX-NEXT: vshufps {{.*#+}} ymm5 = ymm5[2,0,2,3,6,4,6,7]
11570 ; AVX-NEXT: vextractf128 $1, %ymm5, %xmm5
11571 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
11572 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11573 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
11574 ; AVX-NEXT: vunpckhps (%rsp), %ymm0, %ymm0 # 32-byte Folded Reload
11575 ; AVX-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
11576 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11577 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11578 ; AVX-NEXT: # ymm1 = ymm1[3,0],mem[3,0],ymm1[7,4],mem[7,4]
11579 ; AVX-NEXT: vshufps {{.*#+}} ymm0 = ymm1[2,0],ymm0[2,3],ymm1[6,4],ymm0[6,7]
11580 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11581 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
11582 ; AVX-NEXT: # ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
11583 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
11584 ; AVX-NEXT: vshufps $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
11585 ; AVX-NEXT: # ymm2 = ymm2[3,0],mem[3,0],ymm2[7,4],mem[7,4]
11586 ; AVX-NEXT: vextractf128 $1, %ymm1, %xmm1
11587 ; AVX-NEXT: vshufps {{.*#+}} ymm2 = ymm2[2,0,2,3,6,4,6,7]
11588 ; AVX-NEXT: vextractf128 $1, %ymm2, %xmm2
11589 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
11590 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
11591 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11592 ; AVX-NEXT: vmovaps %ymm1, 192(%rsi)
11593 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11594 ; AVX-NEXT: vmovaps %ymm1, 128(%rsi)
11595 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11596 ; AVX-NEXT: vmovaps %ymm1, 64(%rsi)
11597 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11598 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
11599 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11600 ; AVX-NEXT: vmovaps %ymm1, 224(%rsi)
11601 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11602 ; AVX-NEXT: vmovaps %ymm1, 160(%rsi)
11603 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11604 ; AVX-NEXT: vmovaps %ymm1, 96(%rsi)
11605 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11606 ; AVX-NEXT: vmovaps %ymm1, 32(%rsi)
11607 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11608 ; AVX-NEXT: vmovaps %ymm1, 192(%rdx)
11609 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11610 ; AVX-NEXT: vmovaps %ymm1, 128(%rdx)
11611 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11612 ; AVX-NEXT: vmovaps %ymm1, 64(%rdx)
11613 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11614 ; AVX-NEXT: vmovaps %ymm1, (%rdx)
11615 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11616 ; AVX-NEXT: vmovaps %ymm1, 224(%rdx)
11617 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11618 ; AVX-NEXT: vmovaps %ymm1, 160(%rdx)
11619 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11620 ; AVX-NEXT: vmovaps %ymm1, 96(%rdx)
11621 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11622 ; AVX-NEXT: vmovaps %ymm1, 32(%rdx)
11623 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11624 ; AVX-NEXT: vmovaps %ymm1, 192(%rcx)
11625 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11626 ; AVX-NEXT: vmovaps %ymm1, 128(%rcx)
11627 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11628 ; AVX-NEXT: vmovaps %ymm1, 64(%rcx)
11629 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11630 ; AVX-NEXT: vmovaps %ymm1, (%rcx)
11631 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11632 ; AVX-NEXT: vmovaps %ymm1, 224(%rcx)
11633 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11634 ; AVX-NEXT: vmovaps %ymm1, 160(%rcx)
11635 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11636 ; AVX-NEXT: vmovaps %ymm1, 96(%rcx)
11637 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11638 ; AVX-NEXT: vmovaps %ymm1, 32(%rcx)
11639 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11640 ; AVX-NEXT: vmovaps %ymm1, 192(%r8)
11641 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11642 ; AVX-NEXT: vmovaps %ymm1, 128(%r8)
11643 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11644 ; AVX-NEXT: vmovaps %ymm1, 64(%r8)
11645 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11646 ; AVX-NEXT: vmovaps %ymm1, (%r8)
11647 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11648 ; AVX-NEXT: vmovaps %ymm1, 224(%r8)
11649 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11650 ; AVX-NEXT: vmovaps %ymm1, 160(%r8)
11651 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11652 ; AVX-NEXT: vmovaps %ymm1, 96(%r8)
11653 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11654 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
11655 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11656 ; AVX-NEXT: vmovaps %ymm1, 224(%r9)
11657 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11658 ; AVX-NEXT: vmovaps %ymm1, 192(%r9)
11659 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11660 ; AVX-NEXT: vmovaps %ymm1, 160(%r9)
11661 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11662 ; AVX-NEXT: vmovaps %ymm1, 128(%r9)
11663 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11664 ; AVX-NEXT: vmovaps %ymm1, 96(%r9)
11665 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11666 ; AVX-NEXT: vmovaps %ymm1, 64(%r9)
11667 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11668 ; AVX-NEXT: vmovaps %ymm1, 32(%r9)
11669 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11670 ; AVX-NEXT: vmovaps %ymm1, (%r9)
11671 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
11672 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11673 ; AVX-NEXT: vmovaps %ymm1, 224(%rax)
11674 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11675 ; AVX-NEXT: vmovaps %ymm1, 192(%rax)
11676 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11677 ; AVX-NEXT: vmovaps %ymm1, 160(%rax)
11678 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11679 ; AVX-NEXT: vmovaps %ymm1, 128(%rax)
11680 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11681 ; AVX-NEXT: vmovaps %ymm1, 96(%rax)
11682 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11683 ; AVX-NEXT: vmovaps %ymm1, 64(%rax)
11684 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11685 ; AVX-NEXT: vmovaps %ymm1, 32(%rax)
11686 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11687 ; AVX-NEXT: vmovaps %ymm1, (%rax)
11688 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
11689 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11690 ; AVX-NEXT: vmovaps %ymm1, 224(%rax)
11691 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11692 ; AVX-NEXT: vmovaps %ymm1, 192(%rax)
11693 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11694 ; AVX-NEXT: vmovaps %ymm1, 160(%rax)
11695 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11696 ; AVX-NEXT: vmovaps %ymm1, 128(%rax)
11697 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11698 ; AVX-NEXT: vmovaps %ymm1, 96(%rax)
11699 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11700 ; AVX-NEXT: vmovaps %ymm1, 64(%rax)
11701 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11702 ; AVX-NEXT: vmovaps %ymm1, 32(%rax)
11703 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11704 ; AVX-NEXT: vmovaps %ymm1, (%rax)
11705 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
11706 ; AVX-NEXT: vmovaps %ymm5, 224(%rax)
11707 ; AVX-NEXT: vmovaps %ymm14, 192(%rax)
11708 ; AVX-NEXT: vmovaps %ymm10, 160(%rax)
11709 ; AVX-NEXT: vmovaps %ymm4, 128(%rax)
11710 ; AVX-NEXT: vmovaps %ymm3, 96(%rax)
11711 ; AVX-NEXT: vmovaps %ymm12, 64(%rax)
11712 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
11713 ; AVX-NEXT: vmovaps %ymm1, 32(%rax)
11714 ; AVX-NEXT: vmovaps %ymm0, (%rax)
11715 ; AVX-NEXT: addq $3720, %rsp # imm = 0xE88
11716 ; AVX-NEXT: vzeroupper
11719 ; AVX2-LABEL: load_i32_stride8_vf64:
11721 ; AVX2-NEXT: subq $3528, %rsp # imm = 0xDC8
11722 ; AVX2-NEXT: vmovaps 288(%rdi), %xmm10
11723 ; AVX2-NEXT: vmovaps 256(%rdi), %xmm0
11724 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11725 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
11726 ; AVX2-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11727 ; AVX2-NEXT: vmovaps 352(%rdi), %xmm2
11728 ; AVX2-NEXT: vbroadcastss %xmm2, %xmm1
11729 ; AVX2-NEXT: vmovaps %xmm2, %xmm9
11730 ; AVX2-NEXT: vmovaps 320(%rdi), %xmm2
11731 ; AVX2-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
11732 ; AVX2-NEXT: vbroadcastss %xmm2, %xmm2
11733 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11734 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11735 ; AVX2-NEXT: vmovaps 416(%rdi), %xmm1
11736 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11737 ; AVX2-NEXT: vmovaps 384(%rdi), %xmm2
11738 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11739 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11740 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
11741 ; AVX2-NEXT: vmovaps 480(%rdi), %xmm3
11742 ; AVX2-NEXT: vbroadcastss %xmm3, %xmm2
11743 ; AVX2-NEXT: vmovaps %xmm3, %xmm13
11744 ; AVX2-NEXT: vmovaps 448(%rdi), %xmm3
11745 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11746 ; AVX2-NEXT: vbroadcastss %xmm3, %xmm3
11747 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11748 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
11749 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
11750 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11751 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11752 ; AVX2-NEXT: vmovaps 800(%rdi), %xmm0
11753 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11754 ; AVX2-NEXT: vmovaps 768(%rdi), %xmm1
11755 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11756 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11757 ; AVX2-NEXT: vmovaps 864(%rdi), %xmm12
11758 ; AVX2-NEXT: vbroadcastss %xmm12, %xmm1
11759 ; AVX2-NEXT: vmovaps 832(%rdi), %xmm2
11760 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11761 ; AVX2-NEXT: vbroadcastss %xmm2, %xmm2
11762 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11763 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11764 ; AVX2-NEXT: vmovaps 992(%rdi), %xmm1
11765 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11766 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
11767 ; AVX2-NEXT: vmovaps 960(%rdi), %xmm2
11768 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11769 ; AVX2-NEXT: vbroadcastss %xmm2, %xmm2
11770 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11771 ; AVX2-NEXT: vmovaps 928(%rdi), %xmm2
11772 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11773 ; AVX2-NEXT: vmovaps 896(%rdi), %xmm3
11774 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11775 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11776 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
11777 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
11778 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11779 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11780 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11781 ; AVX2-NEXT: vmovaps 1376(%rdi), %xmm0
11782 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11783 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
11784 ; AVX2-NEXT: vmovaps 1344(%rdi), %xmm1
11785 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11786 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
11787 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11788 ; AVX2-NEXT: vmovaps 1312(%rdi), %xmm1
11789 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11790 ; AVX2-NEXT: vmovaps 1280(%rdi), %xmm2
11791 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11792 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11793 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11794 ; AVX2-NEXT: vmovaps 1504(%rdi), %xmm1
11795 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11796 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
11797 ; AVX2-NEXT: vmovaps 1472(%rdi), %xmm2
11798 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11799 ; AVX2-NEXT: vbroadcastss %xmm2, %xmm2
11800 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11801 ; AVX2-NEXT: vmovaps 1440(%rdi), %xmm2
11802 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11803 ; AVX2-NEXT: vmovaps 1408(%rdi), %xmm3
11804 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11805 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11806 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
11807 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
11808 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11809 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11810 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11811 ; AVX2-NEXT: vmovaps 1888(%rdi), %xmm0
11812 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11813 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
11814 ; AVX2-NEXT: vmovaps 1856(%rdi), %xmm1
11815 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11816 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
11817 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11818 ; AVX2-NEXT: vmovaps 1824(%rdi), %xmm1
11819 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11820 ; AVX2-NEXT: vmovaps 1792(%rdi), %xmm2
11821 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11822 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11823 ; AVX2-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
11824 ; AVX2-NEXT: vmovaps 2016(%rdi), %xmm0
11825 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11826 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm1
11827 ; AVX2-NEXT: vmovaps 1984(%rdi), %xmm0
11828 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11829 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm2
11830 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11831 ; AVX2-NEXT: vmovaps 1952(%rdi), %xmm0
11832 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11833 ; AVX2-NEXT: vmovaps 1920(%rdi), %xmm2
11834 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11835 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
11836 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
11837 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
11838 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11839 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7]
11840 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11841 ; AVX2-NEXT: vmovaps 608(%rdi), %xmm0
11842 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11843 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
11844 ; AVX2-NEXT: vmovaps 576(%rdi), %xmm1
11845 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11846 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
11847 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11848 ; AVX2-NEXT: vmovaps 544(%rdi), %xmm2
11849 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11850 ; AVX2-NEXT: vmovaps 512(%rdi), %xmm1
11851 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11852 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
11853 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11854 ; AVX2-NEXT: vmovaps 736(%rdi), %xmm1
11855 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11856 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
11857 ; AVX2-NEXT: vmovaps 704(%rdi), %xmm2
11858 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11859 ; AVX2-NEXT: vbroadcastss %xmm2, %xmm2
11860 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11861 ; AVX2-NEXT: vmovaps 672(%rdi), %xmm3
11862 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11863 ; AVX2-NEXT: vmovaps 640(%rdi), %xmm2
11864 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11865 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
11866 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
11867 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
11868 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11869 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11870 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11871 ; AVX2-NEXT: vmovaps 1120(%rdi), %xmm0
11872 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11873 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
11874 ; AVX2-NEXT: vmovaps 1088(%rdi), %xmm1
11875 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11876 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
11877 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11878 ; AVX2-NEXT: vmovaps 1056(%rdi), %xmm2
11879 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11880 ; AVX2-NEXT: vmovaps 1024(%rdi), %xmm1
11881 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11882 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
11883 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11884 ; AVX2-NEXT: vmovaps 1248(%rdi), %xmm1
11885 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11886 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
11887 ; AVX2-NEXT: vmovaps 1216(%rdi), %xmm2
11888 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11889 ; AVX2-NEXT: vbroadcastss %xmm2, %xmm2
11890 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11891 ; AVX2-NEXT: vmovaps 1184(%rdi), %xmm3
11892 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11893 ; AVX2-NEXT: vmovaps 1152(%rdi), %xmm2
11894 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11895 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
11896 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
11897 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
11898 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11899 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11900 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11901 ; AVX2-NEXT: vmovaps 1632(%rdi), %xmm0
11902 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11903 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
11904 ; AVX2-NEXT: vmovaps 1600(%rdi), %xmm1
11905 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11906 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
11907 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11908 ; AVX2-NEXT: vmovaps 1568(%rdi), %xmm2
11909 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11910 ; AVX2-NEXT: vmovaps 1536(%rdi), %xmm1
11911 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11912 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
11913 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11914 ; AVX2-NEXT: vmovaps 1760(%rdi), %xmm1
11915 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11916 ; AVX2-NEXT: vbroadcastss %xmm1, %xmm1
11917 ; AVX2-NEXT: vmovaps 1728(%rdi), %xmm2
11918 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11919 ; AVX2-NEXT: vbroadcastss %xmm2, %xmm2
11920 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11921 ; AVX2-NEXT: vmovaps 1696(%rdi), %xmm3
11922 ; AVX2-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11923 ; AVX2-NEXT: vmovaps 1664(%rdi), %xmm2
11924 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11925 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
11926 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
11927 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
11928 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11929 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11930 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11931 ; AVX2-NEXT: vmovaps 224(%rdi), %xmm0
11932 ; AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11933 ; AVX2-NEXT: vbroadcastss %xmm0, %xmm0
11934 ; AVX2-NEXT: vmovaps 192(%rdi), %xmm11
11935 ; AVX2-NEXT: vbroadcastss %xmm11, %xmm1
11936 ; AVX2-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11937 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11938 ; AVX2-NEXT: vmovaps 160(%rdi), %xmm2
11939 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11940 ; AVX2-NEXT: vmovaps 128(%rdi), %xmm1
11941 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11942 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
11943 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
11944 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
11945 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7]
11946 ; AVX2-NEXT: vmovaps 96(%rdi), %xmm8
11947 ; AVX2-NEXT: vbroadcastss %xmm8, %xmm1
11948 ; AVX2-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11949 ; AVX2-NEXT: vmovaps 64(%rdi), %xmm7
11950 ; AVX2-NEXT: vbroadcastss %xmm7, %xmm2
11951 ; AVX2-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11952 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
11953 ; AVX2-NEXT: vmovaps (%rdi), %xmm5
11954 ; AVX2-NEXT: vmovaps 32(%rdi), %xmm6
11955 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
11956 ; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11957 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11958 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
11959 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
11960 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11961 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11962 ; AVX2-NEXT: # xmm0 = mem[1,1,1,1]
11963 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
11964 ; AVX2-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11965 ; AVX2-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
11966 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
11967 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11968 ; AVX2-NEXT: vmovaps %xmm13, %xmm9
11969 ; AVX2-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11970 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11971 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
11972 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
11973 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11974 ; AVX2-NEXT: # xmm2 = mem[1,1,1,1]
11975 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11976 ; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
11977 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
11978 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11979 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11980 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11981 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11982 ; AVX2-NEXT: # xmm0 = mem[1,1,1,1]
11983 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11984 ; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
11985 ; AVX2-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11986 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11987 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
11988 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11989 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11990 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11991 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
11992 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
11993 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11994 ; AVX2-NEXT: # xmm2 = mem[1,1,1,1]
11995 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
11996 ; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
11997 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
11998 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11999 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12000 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12001 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12002 ; AVX2-NEXT: # xmm0 = mem[1,1,1,1]
12003 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12004 ; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
12005 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12006 ; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12007 ; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
12008 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12009 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12010 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
12011 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
12012 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12013 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12014 ; AVX2-NEXT: # xmm2 = mem[1,1,1,1]
12015 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
12016 ; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
12017 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
12018 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12019 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12020 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12021 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12022 ; AVX2-NEXT: # xmm0 = mem[1,1,1,1]
12023 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12024 ; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
12025 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12026 ; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12027 ; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
12028 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12029 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12030 ; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12031 ; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
12032 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12033 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12034 ; AVX2-NEXT: # xmm2 = mem[1,1,1,1]
12035 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
12036 ; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
12037 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
12038 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12039 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12040 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12041 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
12042 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
12043 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
12044 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12045 ; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
12046 ; AVX2-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
12047 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12048 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12049 ; AVX2-NEXT: # xmm2 = mem[1,1,1,1]
12050 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
12051 ; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
12052 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
12053 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12054 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12055 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12056 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12057 ; AVX2-NEXT: # xmm0 = mem[1,1,1,1]
12058 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12059 ; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
12060 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12061 ; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12062 ; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
12063 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12064 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12065 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12066 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
12067 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12068 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12069 ; AVX2-NEXT: # xmm2 = mem[1,1,1,1]
12070 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
12071 ; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
12072 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
12073 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12074 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12075 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12076 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12077 ; AVX2-NEXT: # xmm0 = mem[1,1,1,1]
12078 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12079 ; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
12080 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12081 ; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12082 ; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
12083 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12084 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12085 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
12086 ; AVX2-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
12087 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12088 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12089 ; AVX2-NEXT: # xmm2 = mem[1,1,1,1]
12090 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
12091 ; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
12092 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
12093 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12094 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12095 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12096 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12097 ; AVX2-NEXT: # xmm0 = mem[1,1,1,1]
12098 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12099 ; AVX2-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
12100 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12101 ; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12102 ; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
12103 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12104 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12105 ; AVX2-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12106 ; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
12107 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12108 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12109 ; AVX2-NEXT: # xmm2 = mem[1,1,1,1]
12110 ; AVX2-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
12111 ; AVX2-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
12112 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
12113 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12114 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12115 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12116 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12117 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
12118 ; AVX2-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3]
12119 ; AVX2-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12120 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12121 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12122 ; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12123 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12124 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2]
12125 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
12126 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
12127 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12128 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12129 ; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12130 ; AVX2-NEXT: # xmm1 = mem[2,2,2,2]
12131 ; AVX2-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
12132 ; AVX2-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
12133 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
12134 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12135 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12136 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12137 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
12138 ; AVX2-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
12139 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12140 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12141 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12142 ; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12143 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12144 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2]
12145 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
12146 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
12147 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12148 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12149 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
12150 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2]
12151 ; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12152 ; AVX2-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
12153 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12154 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12155 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12156 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12157 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
12158 ; AVX2-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3]
12159 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12160 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12161 ; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12162 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12163 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
12164 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
12165 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
12166 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12167 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12168 ; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12169 ; AVX2-NEXT: # xmm1 = mem[2,2,2,2]
12170 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
12171 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3]
12172 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
12173 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12174 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12175 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12176 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
12177 ; AVX2-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
12178 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12179 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12180 ; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12181 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12182 ; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12183 ; AVX2-NEXT: # xmm0 = mem[2,2,2,2]
12184 ; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12185 ; AVX2-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
12186 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
12187 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12188 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12189 ; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12190 ; AVX2-NEXT: # xmm1 = mem[2,2,2,2]
12191 ; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12192 ; AVX2-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
12193 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
12194 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12195 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12196 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12197 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
12198 ; AVX2-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
12199 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12200 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12201 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12202 ; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12203 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12204 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2]
12205 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
12206 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
12207 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12208 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12209 ; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12210 ; AVX2-NEXT: # xmm1 = mem[2,2,2,2]
12211 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12212 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
12213 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12214 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12215 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12216 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12217 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
12218 ; AVX2-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
12219 ; AVX2-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12220 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12221 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12222 ; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12223 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12224 ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2]
12225 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
12226 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
12227 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12228 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12229 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12230 ; AVX2-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
12231 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12232 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
12233 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12234 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12235 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12236 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12237 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
12238 ; AVX2-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
12239 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12240 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12241 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12242 ; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12243 ; AVX2-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12244 ; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12245 ; AVX2-NEXT: # xmm0 = mem[2,2,2,2]
12246 ; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12247 ; AVX2-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
12248 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
12249 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15
12250 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7]
12251 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
12252 ; AVX2-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2]
12253 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12254 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
12255 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3]
12256 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
12257 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12258 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12259 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
12260 ; AVX2-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
12261 ; AVX2-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12262 ; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
12263 ; AVX2-NEXT: # xmm15 = mem[2,2,2,2]
12264 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12265 ; AVX2-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3]
12266 ; AVX2-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
12267 ; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm14
12268 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
12269 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12270 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
12271 ; AVX2-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3]
12272 ; AVX2-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
12273 ; AVX2-NEXT: # xmm13 = mem[2,2,2,2]
12274 ; AVX2-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
12275 ; AVX2-NEXT: # xmm13 = mem[0,1,2],xmm13[3]
12276 ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
12277 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7]
12278 ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12279 ; AVX2-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload
12280 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
12281 ; AVX2-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3]
12282 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12283 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1]
12284 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12285 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
12286 ; AVX2-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
12287 ; AVX2-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
12288 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
12289 ; AVX2-NEXT: # xmm12 = mem[2,3,2,3]
12290 ; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
12291 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
12292 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
12293 ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12294 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12295 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
12296 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12297 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1]
12298 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12299 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload
12300 ; AVX2-NEXT: # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3]
12301 ; AVX2-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
12302 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
12303 ; AVX2-NEXT: # xmm13 = mem[2,3,2,3]
12304 ; AVX2-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
12305 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
12306 ; AVX2-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
12307 ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12308 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload
12309 ; AVX2-NEXT: # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3]
12310 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1]
12311 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12312 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload
12313 ; AVX2-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3]
12314 ; AVX2-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
12315 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
12316 ; AVX2-NEXT: # xmm10 = mem[2,3,2,3]
12317 ; AVX2-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
12318 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
12319 ; AVX2-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
12320 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12321 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12322 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
12323 ; AVX2-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
12324 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1]
12325 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12326 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
12327 ; AVX2-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
12328 ; AVX2-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
12329 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
12330 ; AVX2-NEXT: # xmm8 = mem[2,3,2,3]
12331 ; AVX2-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
12332 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
12333 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
12334 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12335 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
12336 ; AVX2-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
12337 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12338 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
12339 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12340 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
12341 ; AVX2-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
12342 ; AVX2-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
12343 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
12344 ; AVX2-NEXT: # xmm6 = mem[2,3,2,3]
12345 ; AVX2-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
12346 ; AVX2-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
12347 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
12348 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12349 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
12350 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12351 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
12352 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
12353 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
12354 ; AVX2-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3]
12355 ; AVX2-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
12356 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
12357 ; AVX2-NEXT: # xmm4 = mem[2,3,2,3]
12358 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
12359 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
12360 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
12361 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12362 ; AVX2-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
12363 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12364 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
12365 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
12366 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
12367 ; AVX2-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3]
12368 ; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
12369 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12370 ; AVX2-NEXT: # xmm2 = mem[2,3,2,3]
12371 ; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
12372 ; AVX2-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
12373 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
12374 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12375 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12376 ; AVX2-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
12377 ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
12378 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12379 ; AVX2-NEXT: # xmm0 = mem[2,3,2,3]
12380 ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
12381 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
12382 ; AVX2-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12383 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12384 ; AVX2-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
12385 ; AVX2-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1]
12386 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12387 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12388 ; AVX2-NEXT: vmovaps 32(%rdi), %ymm0
12389 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12390 ; AVX2-NEXT: vmovaps (%rdi), %ymm1
12391 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12392 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
12393 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
12394 ; AVX2-NEXT: vmovaps 96(%rdi), %ymm1
12395 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12396 ; AVX2-NEXT: vmovaps 64(%rdi), %ymm2
12397 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12398 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
12399 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12400 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
12401 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12402 ; AVX2-NEXT: vmovaps 224(%rdi), %ymm2
12403 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12404 ; AVX2-NEXT: vmovaps 192(%rdi), %ymm3
12405 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12406 ; AVX2-NEXT: vmovaps 160(%rdi), %ymm15
12407 ; AVX2-NEXT: vmovaps 128(%rdi), %ymm1
12408 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12409 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
12410 ; AVX2-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12411 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
12412 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12413 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
12414 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12415 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12416 ; AVX2-NEXT: vmovaps 288(%rdi), %ymm0
12417 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12418 ; AVX2-NEXT: vmovaps 256(%rdi), %ymm1
12419 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12420 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
12421 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
12422 ; AVX2-NEXT: vmovaps 352(%rdi), %ymm1
12423 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12424 ; AVX2-NEXT: vmovaps 320(%rdi), %ymm2
12425 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12426 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
12427 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12428 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
12429 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12430 ; AVX2-NEXT: vmovaps 480(%rdi), %ymm2
12431 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12432 ; AVX2-NEXT: vmovaps 448(%rdi), %ymm3
12433 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12434 ; AVX2-NEXT: vmovaps 416(%rdi), %ymm4
12435 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12436 ; AVX2-NEXT: vmovaps 384(%rdi), %ymm1
12437 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12438 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
12439 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
12440 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
12441 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12442 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12443 ; AVX2-NEXT: vmovaps 544(%rdi), %ymm0
12444 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12445 ; AVX2-NEXT: vmovaps 512(%rdi), %ymm1
12446 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12447 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
12448 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
12449 ; AVX2-NEXT: vmovaps 608(%rdi), %ymm1
12450 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12451 ; AVX2-NEXT: vmovaps 576(%rdi), %ymm2
12452 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12453 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
12454 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12455 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
12456 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12457 ; AVX2-NEXT: vmovaps 736(%rdi), %ymm2
12458 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12459 ; AVX2-NEXT: vmovaps 704(%rdi), %ymm3
12460 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12461 ; AVX2-NEXT: vmovaps 672(%rdi), %ymm4
12462 ; AVX2-NEXT: vmovaps 640(%rdi), %ymm1
12463 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12464 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
12465 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12466 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
12467 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
12468 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12469 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12470 ; AVX2-NEXT: vmovaps 800(%rdi), %ymm0
12471 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12472 ; AVX2-NEXT: vmovaps 768(%rdi), %ymm1
12473 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12474 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
12475 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
12476 ; AVX2-NEXT: vmovaps 864(%rdi), %ymm1
12477 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12478 ; AVX2-NEXT: vmovaps 832(%rdi), %ymm2
12479 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12480 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
12481 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12482 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
12483 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12484 ; AVX2-NEXT: vmovaps 992(%rdi), %ymm2
12485 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12486 ; AVX2-NEXT: vmovaps 960(%rdi), %ymm5
12487 ; AVX2-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12488 ; AVX2-NEXT: vmovaps 928(%rdi), %ymm3
12489 ; AVX2-NEXT: vmovaps 896(%rdi), %ymm1
12490 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12491 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
12492 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12493 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5]
12494 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
12495 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12496 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12497 ; AVX2-NEXT: vmovaps 1056(%rdi), %ymm0
12498 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12499 ; AVX2-NEXT: vmovaps 1024(%rdi), %ymm1
12500 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12501 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
12502 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
12503 ; AVX2-NEXT: vmovaps 1120(%rdi), %ymm1
12504 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12505 ; AVX2-NEXT: vmovaps 1088(%rdi), %ymm2
12506 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12507 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
12508 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12509 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
12510 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12511 ; AVX2-NEXT: vmovaps 1248(%rdi), %ymm1
12512 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12513 ; AVX2-NEXT: vmovaps 1216(%rdi), %ymm7
12514 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12515 ; AVX2-NEXT: vmovaps 1184(%rdi), %ymm2
12516 ; AVX2-NEXT: vmovaps 1152(%rdi), %ymm6
12517 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12518 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5]
12519 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12520 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5]
12521 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
12522 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
12523 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12524 ; AVX2-NEXT: vmovaps 1312(%rdi), %ymm0
12525 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12526 ; AVX2-NEXT: vmovaps 1280(%rdi), %ymm6
12527 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12528 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5]
12529 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm6
12530 ; AVX2-NEXT: vmovaps 1376(%rdi), %ymm0
12531 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12532 ; AVX2-NEXT: vmovaps 1344(%rdi), %ymm7
12533 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12534 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5]
12535 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12536 ; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2]
12537 ; AVX2-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3]
12538 ; AVX2-NEXT: vmovaps 1504(%rdi), %ymm6
12539 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12540 ; AVX2-NEXT: vmovaps 1472(%rdi), %ymm10
12541 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12542 ; AVX2-NEXT: vmovaps 1440(%rdi), %ymm0
12543 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
12544 ; AVX2-NEXT: vmovaps 1408(%rdi), %ymm9
12545 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12546 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5]
12547 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
12548 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
12549 ; AVX2-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
12550 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12551 ; AVX2-NEXT: vmovaps 1568(%rdi), %ymm6
12552 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12553 ; AVX2-NEXT: vmovaps 1536(%rdi), %ymm7
12554 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12555 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
12556 ; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm9
12557 ; AVX2-NEXT: vmovaps 1632(%rdi), %ymm6
12558 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12559 ; AVX2-NEXT: vmovaps 1600(%rdi), %ymm7
12560 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12561 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
12562 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12563 ; AVX2-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2]
12564 ; AVX2-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3]
12565 ; AVX2-NEXT: vmovaps 1760(%rdi), %ymm9
12566 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12567 ; AVX2-NEXT: vmovaps 1728(%rdi), %ymm6
12568 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12569 ; AVX2-NEXT: vmovaps 1696(%rdi), %ymm7
12570 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12571 ; AVX2-NEXT: vmovaps 1664(%rdi), %ymm11
12572 ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12573 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5]
12574 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
12575 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12576 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2]
12577 ; AVX2-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
12578 ; AVX2-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12579 ; AVX2-NEXT: vmovaps 1824(%rdi), %ymm6
12580 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12581 ; AVX2-NEXT: vmovaps 1792(%rdi), %ymm7
12582 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12583 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
12584 ; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm11
12585 ; AVX2-NEXT: vmovaps 1888(%rdi), %ymm6
12586 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12587 ; AVX2-NEXT: vmovaps 1856(%rdi), %ymm7
12588 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12589 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
12590 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12591 ; AVX2-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2]
12592 ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3]
12593 ; AVX2-NEXT: vmovaps 2016(%rdi), %ymm11
12594 ; AVX2-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12595 ; AVX2-NEXT: vmovaps 1984(%rdi), %ymm6
12596 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12597 ; AVX2-NEXT: vmovaps 1952(%rdi), %ymm7
12598 ; AVX2-NEXT: vmovaps 1920(%rdi), %ymm9
12599 ; AVX2-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12600 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5]
12601 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12602 ; AVX2-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5]
12603 ; AVX2-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2]
12604 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
12605 ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12606 ; AVX2-NEXT: vbroadcastss 148(%rdi), %ymm13
12607 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
12608 ; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
12609 ; AVX2-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
12610 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12611 ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm14
12612 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
12613 ; AVX2-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
12614 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
12615 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7]
12616 ; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15
12617 ; AVX2-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
12618 ; AVX2-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
12619 ; AVX2-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12620 ; AVX2-NEXT: vbroadcastss 404(%rdi), %ymm13
12621 ; AVX2-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
12622 ; AVX2-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7]
12623 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
12624 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12625 ; AVX2-NEXT: vextractf128 $1, %ymm10, %xmm13
12626 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
12627 ; AVX2-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5]
12628 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
12629 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
12630 ; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14
12631 ; AVX2-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
12632 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
12633 ; AVX2-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12634 ; AVX2-NEXT: vbroadcastss 660(%rdi), %ymm12
12635 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
12636 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7]
12637 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12638 ; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8
12639 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
12640 ; AVX2-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5]
12641 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
12642 ; AVX2-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
12643 ; AVX2-NEXT: vextractf128 $1, %ymm12, %xmm12
12644 ; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
12645 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
12646 ; AVX2-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12647 ; AVX2-NEXT: vbroadcastss 916(%rdi), %ymm4
12648 ; AVX2-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
12649 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7]
12650 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12651 ; AVX2-NEXT: vextractf128 $1, %ymm4, %xmm4
12652 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
12653 ; AVX2-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
12654 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
12655 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7]
12656 ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5
12657 ; AVX2-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
12658 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
12659 ; AVX2-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12660 ; AVX2-NEXT: vbroadcastss 1172(%rdi), %ymm3
12661 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12662 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
12663 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12664 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
12665 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12666 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5]
12667 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
12668 ; AVX2-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
12669 ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3
12670 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
12671 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
12672 ; AVX2-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12673 ; AVX2-NEXT: vbroadcastss 1428(%rdi), %ymm1
12674 ; AVX2-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
12675 ; AVX2-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7]
12676 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
12677 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12678 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm0
12679 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
12680 ; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
12681 ; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12682 ; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
12683 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
12684 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
12685 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12686 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12687 ; AVX2-NEXT: vbroadcastss 1684(%rdi), %ymm0
12688 ; AVX2-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12689 ; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7]
12690 ; AVX2-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12691 ; AVX2-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
12692 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12693 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
12694 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12695 ; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
12696 ; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12697 ; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
12698 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
12699 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12700 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12701 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12702 ; AVX2-NEXT: vbroadcastss 1940(%rdi), %ymm0
12703 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
12704 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
12705 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12706 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
12707 ; AVX2-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12708 ; AVX2-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
12709 ; AVX2-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
12710 ; AVX2-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
12711 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
12712 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
12713 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12714 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12715 ; AVX2-NEXT: vbroadcastss 248(%rdi), %ymm0
12716 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12717 ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12718 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12719 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
12720 ; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12721 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12722 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7]
12723 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12724 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12725 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12726 ; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12727 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12728 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1
12729 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2]
12730 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12731 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
12732 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12733 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12734 ; AVX2-NEXT: vbroadcastss 504(%rdi), %ymm0
12735 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12736 ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12737 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12738 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12739 ; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12740 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12741 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7]
12742 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12743 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12744 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
12745 ; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12746 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12747 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1
12748 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
12749 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12750 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
12751 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12752 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12753 ; AVX2-NEXT: vbroadcastss 760(%rdi), %ymm0
12754 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12755 ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12756 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12757 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12758 ; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12759 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12760 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7]
12761 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12762 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12763 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
12764 ; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12765 ; AVX2-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12766 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1
12767 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
12768 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12769 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
12770 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12771 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12772 ; AVX2-NEXT: vbroadcastss 1016(%rdi), %ymm0
12773 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12774 ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12775 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12776 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12777 ; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12778 ; AVX2-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12779 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7]
12780 ; AVX2-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12781 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12782 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
12783 ; AVX2-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12784 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm1
12785 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2]
12786 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12787 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
12788 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12789 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12790 ; AVX2-NEXT: vbroadcastss 1272(%rdi), %ymm0
12791 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12792 ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12793 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12794 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload
12795 ; AVX2-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12796 ; AVX2-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
12797 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12798 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
12799 ; AVX2-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12800 ; AVX2-NEXT: vextractf128 $1, %ymm13, %xmm1
12801 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2]
12802 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12803 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
12804 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12805 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12806 ; AVX2-NEXT: vbroadcastss 1528(%rdi), %ymm0
12807 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12808 ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12809 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12810 ; AVX2-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
12811 ; AVX2-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12812 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
12813 ; AVX2-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
12814 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12815 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
12816 ; AVX2-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12817 ; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm1
12818 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2]
12819 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12820 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
12821 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12822 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12823 ; AVX2-NEXT: vbroadcastss 1784(%rdi), %ymm0
12824 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12825 ; AVX2-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
12826 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12827 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
12828 ; AVX2-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12829 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12830 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
12831 ; AVX2-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12832 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12833 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
12834 ; AVX2-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
12835 ; AVX2-NEXT: vextractf128 $1, %ymm7, %xmm1
12836 ; AVX2-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2]
12837 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12838 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
12839 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12840 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12841 ; AVX2-NEXT: vbroadcastss 2040(%rdi), %ymm0
12842 ; AVX2-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
12843 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
12844 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12845 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
12846 ; AVX2-NEXT: # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12847 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12848 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
12849 ; AVX2-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12850 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
12851 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
12852 ; AVX2-NEXT: # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12853 ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm0
12854 ; AVX2-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2]
12855 ; AVX2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
12856 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
12857 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12858 ; AVX2-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
12859 ; AVX2-NEXT: vbroadcastss 220(%rdi), %ymm0
12860 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12861 ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12862 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12863 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
12864 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12865 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
12866 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
12867 ; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
12868 ; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15
12869 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
12870 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12871 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12872 ; AVX2-NEXT: vbroadcastss 476(%rdi), %ymm0
12873 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12874 ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12875 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12876 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
12877 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12878 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
12879 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
12880 ; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
12881 ; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15
12882 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
12883 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12884 ; AVX2-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12885 ; AVX2-NEXT: vbroadcastss 732(%rdi), %ymm0
12886 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12887 ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12888 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12889 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
12890 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12891 ; AVX2-NEXT: vextractf128 $1, %ymm1, %xmm1
12892 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
12893 ; AVX2-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
12894 ; AVX2-NEXT: vextractf128 $1, %ymm15, %xmm15
12895 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
12896 ; AVX2-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12897 ; AVX2-NEXT: vbroadcastss 988(%rdi), %ymm0
12898 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12899 ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12900 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
12901 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
12902 ; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm1
12903 ; AVX2-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
12904 ; AVX2-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7]
12905 ; AVX2-NEXT: vextractf128 $1, %ymm14, %xmm14
12906 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
12907 ; AVX2-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12908 ; AVX2-NEXT: vbroadcastss 1244(%rdi), %ymm0
12909 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12910 ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12911 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3]
12912 ; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm1
12913 ; AVX2-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7]
12914 ; AVX2-NEXT: vextractf128 $1, %ymm11, %xmm11
12915 ; AVX2-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
12916 ; AVX2-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
12917 ; AVX2-NEXT: vbroadcastss 1500(%rdi), %ymm0
12918 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12919 ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12920 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3]
12921 ; AVX2-NEXT: vextractf128 $1, %ymm8, %xmm8
12922 ; AVX2-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
12923 ; AVX2-NEXT: vextractf128 $1, %ymm9, %xmm9
12924 ; AVX2-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
12925 ; AVX2-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
12926 ; AVX2-NEXT: vbroadcastss 1756(%rdi), %ymm0
12927 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12928 ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12929 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3]
12930 ; AVX2-NEXT: vextractf128 $1, %ymm5, %xmm5
12931 ; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7]
12932 ; AVX2-NEXT: vextractf128 $1, %ymm6, %xmm6
12933 ; AVX2-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
12934 ; AVX2-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
12935 ; AVX2-NEXT: vbroadcastss 2012(%rdi), %ymm0
12936 ; AVX2-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
12937 ; AVX2-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
12938 ; AVX2-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3]
12939 ; AVX2-NEXT: vextractf128 $1, %ymm2, %xmm2
12940 ; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7]
12941 ; AVX2-NEXT: vextractf128 $1, %ymm3, %xmm3
12942 ; AVX2-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
12943 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12944 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12945 ; AVX2-NEXT: vmovaps %ymm2, 192(%rsi)
12946 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12947 ; AVX2-NEXT: vmovaps %ymm2, 128(%rsi)
12948 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12949 ; AVX2-NEXT: vmovaps %ymm2, 64(%rsi)
12950 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12951 ; AVX2-NEXT: vmovaps %ymm2, (%rsi)
12952 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12953 ; AVX2-NEXT: vmovaps %ymm2, 224(%rsi)
12954 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12955 ; AVX2-NEXT: vmovaps %ymm2, 160(%rsi)
12956 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12957 ; AVX2-NEXT: vmovaps %ymm2, 96(%rsi)
12958 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12959 ; AVX2-NEXT: vmovaps %ymm2, 32(%rsi)
12960 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12961 ; AVX2-NEXT: vmovaps %ymm2, 192(%rdx)
12962 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12963 ; AVX2-NEXT: vmovaps %ymm2, 128(%rdx)
12964 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12965 ; AVX2-NEXT: vmovaps %ymm2, 64(%rdx)
12966 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12967 ; AVX2-NEXT: vmovaps %ymm2, (%rdx)
12968 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12969 ; AVX2-NEXT: vmovaps %ymm2, 224(%rdx)
12970 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12971 ; AVX2-NEXT: vmovaps %ymm2, 160(%rdx)
12972 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12973 ; AVX2-NEXT: vmovaps %ymm2, 96(%rdx)
12974 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12975 ; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
12976 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12977 ; AVX2-NEXT: vmovaps %ymm2, 192(%rcx)
12978 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12979 ; AVX2-NEXT: vmovaps %ymm2, 128(%rcx)
12980 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12981 ; AVX2-NEXT: vmovaps %ymm2, 64(%rcx)
12982 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12983 ; AVX2-NEXT: vmovaps %ymm2, (%rcx)
12984 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12985 ; AVX2-NEXT: vmovaps %ymm2, 224(%rcx)
12986 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12987 ; AVX2-NEXT: vmovaps %ymm2, 160(%rcx)
12988 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12989 ; AVX2-NEXT: vmovaps %ymm2, 96(%rcx)
12990 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12991 ; AVX2-NEXT: vmovaps %ymm2, 32(%rcx)
12992 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12993 ; AVX2-NEXT: vmovaps %ymm2, 192(%r8)
12994 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12995 ; AVX2-NEXT: vmovaps %ymm2, 128(%r8)
12996 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12997 ; AVX2-NEXT: vmovaps %ymm2, 64(%r8)
12998 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
12999 ; AVX2-NEXT: vmovaps %ymm2, (%r8)
13000 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13001 ; AVX2-NEXT: vmovaps %ymm2, 224(%r8)
13002 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13003 ; AVX2-NEXT: vmovaps %ymm2, 160(%r8)
13004 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13005 ; AVX2-NEXT: vmovaps %ymm2, 96(%r8)
13006 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13007 ; AVX2-NEXT: vmovaps %ymm2, 32(%r8)
13008 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13009 ; AVX2-NEXT: vmovaps %ymm2, 224(%r9)
13010 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13011 ; AVX2-NEXT: vmovaps %ymm2, 192(%r9)
13012 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13013 ; AVX2-NEXT: vmovaps %ymm2, 160(%r9)
13014 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13015 ; AVX2-NEXT: vmovaps %ymm2, 128(%r9)
13016 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13017 ; AVX2-NEXT: vmovaps %ymm2, 96(%r9)
13018 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13019 ; AVX2-NEXT: vmovaps %ymm2, 64(%r9)
13020 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13021 ; AVX2-NEXT: vmovaps %ymm2, 32(%r9)
13022 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13023 ; AVX2-NEXT: vmovaps %ymm2, (%r9)
13024 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
13025 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13026 ; AVX2-NEXT: vmovaps %ymm2, 224(%rax)
13027 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13028 ; AVX2-NEXT: vmovaps %ymm2, 192(%rax)
13029 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13030 ; AVX2-NEXT: vmovaps %ymm2, 160(%rax)
13031 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13032 ; AVX2-NEXT: vmovaps %ymm2, 128(%rax)
13033 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13034 ; AVX2-NEXT: vmovaps %ymm2, 96(%rax)
13035 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13036 ; AVX2-NEXT: vmovaps %ymm2, 64(%rax)
13037 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13038 ; AVX2-NEXT: vmovaps %ymm2, 32(%rax)
13039 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13040 ; AVX2-NEXT: vmovaps %ymm2, (%rax)
13041 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
13042 ; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
13043 ; AVX2-NEXT: vmovaps %ymm2, 224(%rax)
13044 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13045 ; AVX2-NEXT: vmovaps %ymm2, 192(%rax)
13046 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13047 ; AVX2-NEXT: vmovaps %ymm2, 160(%rax)
13048 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13049 ; AVX2-NEXT: vmovaps %ymm2, 128(%rax)
13050 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13051 ; AVX2-NEXT: vmovaps %ymm2, 96(%rax)
13052 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13053 ; AVX2-NEXT: vmovaps %ymm2, 64(%rax)
13054 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13055 ; AVX2-NEXT: vmovaps %ymm2, 32(%rax)
13056 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
13057 ; AVX2-NEXT: vmovaps %ymm2, (%rax)
13058 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
13059 ; AVX2-NEXT: vmovaps %ymm0, 224(%rax)
13060 ; AVX2-NEXT: vmovaps %ymm5, 192(%rax)
13061 ; AVX2-NEXT: vmovaps %ymm8, 160(%rax)
13062 ; AVX2-NEXT: vmovaps %ymm1, 128(%rax)
13063 ; AVX2-NEXT: vmovaps %ymm14, 96(%rax)
13064 ; AVX2-NEXT: vmovaps %ymm15, 64(%rax)
13065 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13066 ; AVX2-NEXT: vmovaps %ymm0, 32(%rax)
13067 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
13068 ; AVX2-NEXT: vmovaps %ymm0, (%rax)
13069 ; AVX2-NEXT: addq $3528, %rsp # imm = 0xDC8
13070 ; AVX2-NEXT: vzeroupper
13071 ; AVX2-NEXT: retq
13072 ;
13073 ; AVX2-FP-LABEL: load_i32_stride8_vf64:
13074 ; AVX2-FP: # %bb.0:
13075 ; AVX2-FP-NEXT: subq $3528, %rsp # imm = 0xDC8
13076 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %xmm10
13077 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %xmm0
13078 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13079 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
13080 ; AVX2-FP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13081 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %xmm2
13082 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm1
13083 ; AVX2-FP-NEXT: vmovaps %xmm2, %xmm9
13084 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %xmm2
13085 ; AVX2-FP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
13086 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2
13087 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13088 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13089 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %xmm1
13090 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13091 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %xmm2
13092 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13093 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13094 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13095 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %xmm3
13096 ; AVX2-FP-NEXT: vbroadcastss %xmm3, %xmm2
13097 ; AVX2-FP-NEXT: vmovaps %xmm3, %xmm13
13098 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %xmm3
13099 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13100 ; AVX2-FP-NEXT: vbroadcastss %xmm3, %xmm3
13101 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
13102 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13103 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
13104 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13105 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13106 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %xmm0
13107 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13108 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %xmm1
13109 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13110 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13111 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %xmm12
13112 ; AVX2-FP-NEXT: vbroadcastss %xmm12, %xmm1
13113 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %xmm2
13114 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13115 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2
13116 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13117 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13118 ; AVX2-FP-NEXT: vmovaps 992(%rdi), %xmm1
13119 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13120 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
13121 ; AVX2-FP-NEXT: vmovaps 960(%rdi), %xmm2
13122 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13123 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2
13124 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13125 ; AVX2-FP-NEXT: vmovaps 928(%rdi), %xmm2
13126 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13127 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %xmm3
13128 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13129 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
13130 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13131 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13132 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13133 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13134 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13135 ; AVX2-FP-NEXT: vmovaps 1376(%rdi), %xmm0
13136 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13137 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0
13138 ; AVX2-FP-NEXT: vmovaps 1344(%rdi), %xmm1
13139 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13140 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
13141 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13142 ; AVX2-FP-NEXT: vmovaps 1312(%rdi), %xmm1
13143 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13144 ; AVX2-FP-NEXT: vmovaps 1280(%rdi), %xmm2
13145 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13146 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13147 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13148 ; AVX2-FP-NEXT: vmovaps 1504(%rdi), %xmm1
13149 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13150 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
13151 ; AVX2-FP-NEXT: vmovaps 1472(%rdi), %xmm2
13152 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13153 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2
13154 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13155 ; AVX2-FP-NEXT: vmovaps 1440(%rdi), %xmm2
13156 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13157 ; AVX2-FP-NEXT: vmovaps 1408(%rdi), %xmm3
13158 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13159 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
13160 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13161 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13162 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13163 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13164 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13165 ; AVX2-FP-NEXT: vmovaps 1888(%rdi), %xmm0
13166 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13167 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0
13168 ; AVX2-FP-NEXT: vmovaps 1856(%rdi), %xmm1
13169 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13170 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
13171 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13172 ; AVX2-FP-NEXT: vmovaps 1824(%rdi), %xmm1
13173 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13174 ; AVX2-FP-NEXT: vmovaps 1792(%rdi), %xmm2
13175 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13176 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13177 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
13178 ; AVX2-FP-NEXT: vmovaps 2016(%rdi), %xmm0
13179 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13180 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm1
13181 ; AVX2-FP-NEXT: vmovaps 1984(%rdi), %xmm0
13182 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13183 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm2
13184 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13185 ; AVX2-FP-NEXT: vmovaps 1952(%rdi), %xmm0
13186 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13187 ; AVX2-FP-NEXT: vmovaps 1920(%rdi), %xmm2
13188 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13189 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
13190 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13191 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13192 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13193 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7]
13194 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13195 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %xmm0
13196 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13197 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0
13198 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %xmm1
13199 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13200 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
13201 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13202 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %xmm2
13203 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13204 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %xmm1
13205 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13206 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
13207 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13208 ; AVX2-FP-NEXT: vmovaps 736(%rdi), %xmm1
13209 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13210 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
13211 ; AVX2-FP-NEXT: vmovaps 704(%rdi), %xmm2
13212 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13213 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2
13214 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13215 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %xmm3
13216 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13217 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %xmm2
13218 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13219 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
13220 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13221 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13222 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13223 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13224 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13225 ; AVX2-FP-NEXT: vmovaps 1120(%rdi), %xmm0
13226 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13227 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0
13228 ; AVX2-FP-NEXT: vmovaps 1088(%rdi), %xmm1
13229 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13230 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
13231 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13232 ; AVX2-FP-NEXT: vmovaps 1056(%rdi), %xmm2
13233 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13234 ; AVX2-FP-NEXT: vmovaps 1024(%rdi), %xmm1
13235 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13236 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
13237 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13238 ; AVX2-FP-NEXT: vmovaps 1248(%rdi), %xmm1
13239 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13240 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
13241 ; AVX2-FP-NEXT: vmovaps 1216(%rdi), %xmm2
13242 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13243 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2
13244 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13245 ; AVX2-FP-NEXT: vmovaps 1184(%rdi), %xmm3
13246 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13247 ; AVX2-FP-NEXT: vmovaps 1152(%rdi), %xmm2
13248 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13249 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
13250 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13251 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13252 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13253 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13254 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13255 ; AVX2-FP-NEXT: vmovaps 1632(%rdi), %xmm0
13256 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13257 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0
13258 ; AVX2-FP-NEXT: vmovaps 1600(%rdi), %xmm1
13259 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13260 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
13261 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13262 ; AVX2-FP-NEXT: vmovaps 1568(%rdi), %xmm2
13263 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13264 ; AVX2-FP-NEXT: vmovaps 1536(%rdi), %xmm1
13265 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13266 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
13267 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13268 ; AVX2-FP-NEXT: vmovaps 1760(%rdi), %xmm1
13269 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13270 ; AVX2-FP-NEXT: vbroadcastss %xmm1, %xmm1
13271 ; AVX2-FP-NEXT: vmovaps 1728(%rdi), %xmm2
13272 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13273 ; AVX2-FP-NEXT: vbroadcastss %xmm2, %xmm2
13274 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13275 ; AVX2-FP-NEXT: vmovaps 1696(%rdi), %xmm3
13276 ; AVX2-FP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13277 ; AVX2-FP-NEXT: vmovaps 1664(%rdi), %xmm2
13278 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13279 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
13280 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13281 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13282 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13283 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13284 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13285 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %xmm0
13286 ; AVX2-FP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13287 ; AVX2-FP-NEXT: vbroadcastss %xmm0, %xmm0
13288 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %xmm11
13289 ; AVX2-FP-NEXT: vbroadcastss %xmm11, %xmm1
13290 ; AVX2-FP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13291 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13292 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %xmm2
13293 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13294 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %xmm1
13295 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13296 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
13297 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13298 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13299 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13300 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %xmm8
13301 ; AVX2-FP-NEXT: vbroadcastss %xmm8, %xmm1
13302 ; AVX2-FP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13303 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %xmm7
13304 ; AVX2-FP-NEXT: vbroadcastss %xmm7, %xmm2
13305 ; AVX2-FP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13306 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13307 ; AVX2-FP-NEXT: vmovaps (%rdi), %xmm5
13308 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %xmm6
13309 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
13310 ; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13311 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13312 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
13313 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
13314 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13315 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13316 ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1]
13317 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
13318 ; AVX2-FP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13319 ; AVX2-FP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
13320 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
13321 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13322 ; AVX2-FP-NEXT: vmovaps %xmm13, %xmm9
13323 ; AVX2-FP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13324 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
13325 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
13326 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13327 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13328 ; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1]
13329 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13330 ; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13331 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13332 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13333 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13334 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13335 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13336 ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1]
13337 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13338 ; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13339 ; AVX2-FP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13340 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13341 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
13342 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13343 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
13344 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
13345 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
13346 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13347 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13348 ; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1]
13349 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13350 ; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13351 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13352 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13353 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13354 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13355 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13356 ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1]
13357 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13358 ; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13359 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13360 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13361 ; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13362 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13363 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
13364 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
13365 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
13366 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13367 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13368 ; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1]
13369 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13370 ; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13371 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13372 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13373 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13374 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13375 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13376 ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1]
13377 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13378 ; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13379 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13380 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13381 ; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13382 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13383 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13384 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13385 ; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13386 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13387 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13388 ; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1]
13389 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13390 ; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13391 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13392 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13393 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13394 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13395 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
13396 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
13397 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
13398 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13399 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
13400 ; AVX2-FP-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
13401 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13402 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13403 ; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1]
13404 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13405 ; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13406 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13407 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13408 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13409 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13410 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13411 ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1]
13412 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13413 ; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13414 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13415 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13416 ; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13417 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13418 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13419 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13420 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
13421 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13422 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13423 ; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1]
13424 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13425 ; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13426 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13427 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13428 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13429 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13430 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13431 ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1]
13432 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13433 ; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13434 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13435 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13436 ; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13437 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13438 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13439 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
13440 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
13441 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13442 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13443 ; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1]
13444 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13445 ; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13446 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13447 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13448 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13449 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13450 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13451 ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1]
13452 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13453 ; AVX2-FP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
13454 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13455 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13456 ; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13457 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13458 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13459 ; AVX2-FP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13460 ; AVX2-FP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
13461 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13462 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13463 ; AVX2-FP-NEXT: # xmm2 = mem[1,1,1,1]
13464 ; AVX2-FP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13465 ; AVX2-FP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
13466 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13467 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13468 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13469 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13470 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13471 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
13472 ; AVX2-FP-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3]
13473 ; AVX2-FP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13474 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13475 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13476 ; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13477 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13478 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2]
13479 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
13480 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13481 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13482 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13483 ; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
13484 ; AVX2-FP-NEXT: # xmm1 = mem[2,2,2,2]
13485 ; AVX2-FP-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
13486 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
13487 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
13488 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13489 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13490 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13491 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13492 ; AVX2-FP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
13493 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13494 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13495 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13496 ; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13497 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13498 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2]
13499 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
13500 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13501 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13502 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13503 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
13504 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2]
13505 ; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13506 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
13507 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13508 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13509 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13510 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13511 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
13512 ; AVX2-FP-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3]
13513 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13514 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13515 ; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13516 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13517 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
13518 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
13519 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13520 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13521 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13522 ; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
13523 ; AVX2-FP-NEXT: # xmm1 = mem[2,2,2,2]
13524 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
13525 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3]
13526 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
13527 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13528 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13529 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13530 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
13531 ; AVX2-FP-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
13532 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13533 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13534 ; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13535 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13536 ; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13537 ; AVX2-FP-NEXT: # xmm0 = mem[2,2,2,2]
13538 ; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13539 ; AVX2-FP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
13540 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13541 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13542 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13543 ; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
13544 ; AVX2-FP-NEXT: # xmm1 = mem[2,2,2,2]
13545 ; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13546 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
13547 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
13548 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13549 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13550 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13551 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13552 ; AVX2-FP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
13553 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13554 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13555 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13556 ; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13557 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13558 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2]
13559 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
13560 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13561 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13562 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13563 ; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
13564 ; AVX2-FP-NEXT: # xmm1 = mem[2,2,2,2]
13565 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13566 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
13567 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13568 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13569 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13570 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13571 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13572 ; AVX2-FP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
13573 ; AVX2-FP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13574 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13575 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13576 ; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13577 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13578 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2]
13579 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
13580 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13581 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13582 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13583 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13584 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
13585 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13586 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
13587 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13588 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13589 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13590 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13591 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
13592 ; AVX2-FP-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
13593 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13594 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13595 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13596 ; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13597 ; AVX2-FP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13598 ; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13599 ; AVX2-FP-NEXT: # xmm0 = mem[2,2,2,2]
13600 ; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13601 ; AVX2-FP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
13602 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13603 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15
13604 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7]
13605 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13606 ; AVX2-FP-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2]
13607 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13608 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
13609 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3]
13610 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
13611 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13612 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13613 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
13614 ; AVX2-FP-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
13615 ; AVX2-FP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13616 ; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
13617 ; AVX2-FP-NEXT: # xmm15 = mem[2,2,2,2]
13618 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13619 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3]
13620 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
13621 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm14
13622 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
13623 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13624 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
13625 ; AVX2-FP-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3]
13626 ; AVX2-FP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
13627 ; AVX2-FP-NEXT: # xmm13 = mem[2,2,2,2]
13628 ; AVX2-FP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
13629 ; AVX2-FP-NEXT: # xmm13 = mem[0,1,2],xmm13[3]
13630 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
13631 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7]
13632 ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13633 ; AVX2-FP-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload
13634 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
13635 ; AVX2-FP-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3]
13636 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13637 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1]
13638 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13639 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
13640 ; AVX2-FP-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
13641 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
13642 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
13643 ; AVX2-FP-NEXT: # xmm12 = mem[2,3,2,3]
13644 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
13645 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
13646 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
13647 ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13648 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13649 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
13650 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13651 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1]
13652 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13653 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload
13654 ; AVX2-FP-NEXT: # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3]
13655 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
13656 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
13657 ; AVX2-FP-NEXT: # xmm13 = mem[2,3,2,3]
13658 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
13659 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
13660 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
13661 ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13662 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload
13663 ; AVX2-FP-NEXT: # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3]
13664 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1]
13665 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13666 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload
13667 ; AVX2-FP-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3]
13668 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
13669 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
13670 ; AVX2-FP-NEXT: # xmm10 = mem[2,3,2,3]
13671 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
13672 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
13673 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
13674 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13675 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13676 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
13677 ; AVX2-FP-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
13678 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1]
13679 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13680 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
13681 ; AVX2-FP-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
13682 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
13683 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
13684 ; AVX2-FP-NEXT: # xmm8 = mem[2,3,2,3]
13685 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
13686 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
13687 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
13688 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13689 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
13690 ; AVX2-FP-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
13691 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
13692 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
13693 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
13694 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
13695 ; AVX2-FP-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
13696 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
13697 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
13698 ; AVX2-FP-NEXT: # xmm6 = mem[2,3,2,3]
13699 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
13700 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
13701 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
13702 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13703 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
13704 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
13705 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
13706 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
13707 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
13708 ; AVX2-FP-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3]
13709 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
13710 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
13711 ; AVX2-FP-NEXT: # xmm4 = mem[2,3,2,3]
13712 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
13713 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
13714 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
13715 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13716 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
13717 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13718 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
13719 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13720 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
13721 ; AVX2-FP-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3]
13722 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
13723 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
13724 ; AVX2-FP-NEXT: # xmm2 = mem[2,3,2,3]
13725 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
13726 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
13727 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13728 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13729 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13730 ; AVX2-FP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
13731 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
13732 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
13733 ; AVX2-FP-NEXT: # xmm0 = mem[2,3,2,3]
13734 ; AVX2-FP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
13735 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13736 ; AVX2-FP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13737 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13738 ; AVX2-FP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
13739 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1]
13740 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
13741 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13742 ; AVX2-FP-NEXT: vmovaps 32(%rdi), %ymm0
13743 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13744 ; AVX2-FP-NEXT: vmovaps (%rdi), %ymm1
13745 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13746 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
13747 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
13748 ; AVX2-FP-NEXT: vmovaps 96(%rdi), %ymm1
13749 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13750 ; AVX2-FP-NEXT: vmovaps 64(%rdi), %ymm2
13751 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13752 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
13753 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13754 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
13755 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13756 ; AVX2-FP-NEXT: vmovaps 224(%rdi), %ymm2
13757 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13758 ; AVX2-FP-NEXT: vmovaps 192(%rdi), %ymm3
13759 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13760 ; AVX2-FP-NEXT: vmovaps 160(%rdi), %ymm15
13761 ; AVX2-FP-NEXT: vmovaps 128(%rdi), %ymm1
13762 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13763 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
13764 ; AVX2-FP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13765 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
13766 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13767 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
13768 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13769 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13770 ; AVX2-FP-NEXT: vmovaps 288(%rdi), %ymm0
13771 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13772 ; AVX2-FP-NEXT: vmovaps 256(%rdi), %ymm1
13773 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13774 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
13775 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
13776 ; AVX2-FP-NEXT: vmovaps 352(%rdi), %ymm1
13777 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13778 ; AVX2-FP-NEXT: vmovaps 320(%rdi), %ymm2
13779 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13780 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
13781 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13782 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
13783 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13784 ; AVX2-FP-NEXT: vmovaps 480(%rdi), %ymm2
13785 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13786 ; AVX2-FP-NEXT: vmovaps 448(%rdi), %ymm3
13787 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13788 ; AVX2-FP-NEXT: vmovaps 416(%rdi), %ymm4
13789 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13790 ; AVX2-FP-NEXT: vmovaps 384(%rdi), %ymm1
13791 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13792 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
13793 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
13794 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
13795 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13796 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13797 ; AVX2-FP-NEXT: vmovaps 544(%rdi), %ymm0
13798 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13799 ; AVX2-FP-NEXT: vmovaps 512(%rdi), %ymm1
13800 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13801 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
13802 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
13803 ; AVX2-FP-NEXT: vmovaps 608(%rdi), %ymm1
13804 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13805 ; AVX2-FP-NEXT: vmovaps 576(%rdi), %ymm2
13806 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13807 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
13808 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13809 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
13810 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13811 ; AVX2-FP-NEXT: vmovaps 736(%rdi), %ymm2
13812 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13813 ; AVX2-FP-NEXT: vmovaps 704(%rdi), %ymm3
13814 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13815 ; AVX2-FP-NEXT: vmovaps 672(%rdi), %ymm4
13816 ; AVX2-FP-NEXT: vmovaps 640(%rdi), %ymm1
13817 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13818 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
13819 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13820 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
13821 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
13822 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13823 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13824 ; AVX2-FP-NEXT: vmovaps 800(%rdi), %ymm0
13825 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13826 ; AVX2-FP-NEXT: vmovaps 768(%rdi), %ymm1
13827 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13828 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
13829 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
13830 ; AVX2-FP-NEXT: vmovaps 864(%rdi), %ymm1
13831 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13832 ; AVX2-FP-NEXT: vmovaps 832(%rdi), %ymm2
13833 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13834 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
13835 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13836 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
13837 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13838 ; AVX2-FP-NEXT: vmovaps 992(%rdi), %ymm2
13839 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13840 ; AVX2-FP-NEXT: vmovaps 960(%rdi), %ymm5
13841 ; AVX2-FP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13842 ; AVX2-FP-NEXT: vmovaps 928(%rdi), %ymm3
13843 ; AVX2-FP-NEXT: vmovaps 896(%rdi), %ymm1
13844 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13845 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
13846 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13847 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5]
13848 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
13849 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
13850 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13851 ; AVX2-FP-NEXT: vmovaps 1056(%rdi), %ymm0
13852 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13853 ; AVX2-FP-NEXT: vmovaps 1024(%rdi), %ymm1
13854 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13855 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
13856 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
13857 ; AVX2-FP-NEXT: vmovaps 1120(%rdi), %ymm1
13858 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13859 ; AVX2-FP-NEXT: vmovaps 1088(%rdi), %ymm2
13860 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13861 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
13862 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13863 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
13864 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13865 ; AVX2-FP-NEXT: vmovaps 1248(%rdi), %ymm1
13866 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13867 ; AVX2-FP-NEXT: vmovaps 1216(%rdi), %ymm7
13868 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13869 ; AVX2-FP-NEXT: vmovaps 1184(%rdi), %ymm2
13870 ; AVX2-FP-NEXT: vmovaps 1152(%rdi), %ymm6
13871 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13872 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5]
13873 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13874 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5]
13875 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
13876 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
13877 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13878 ; AVX2-FP-NEXT: vmovaps 1312(%rdi), %ymm0
13879 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13880 ; AVX2-FP-NEXT: vmovaps 1280(%rdi), %ymm6
13881 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13882 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5]
13883 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm6
13884 ; AVX2-FP-NEXT: vmovaps 1376(%rdi), %ymm0
13885 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13886 ; AVX2-FP-NEXT: vmovaps 1344(%rdi), %ymm7
13887 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13888 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5]
13889 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13890 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2]
13891 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3]
13892 ; AVX2-FP-NEXT: vmovaps 1504(%rdi), %ymm6
13893 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13894 ; AVX2-FP-NEXT: vmovaps 1472(%rdi), %ymm10
13895 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13896 ; AVX2-FP-NEXT: vmovaps 1440(%rdi), %ymm0
13897 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
13898 ; AVX2-FP-NEXT: vmovaps 1408(%rdi), %ymm9
13899 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13900 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5]
13901 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
13902 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
13903 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
13904 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13905 ; AVX2-FP-NEXT: vmovaps 1568(%rdi), %ymm6
13906 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13907 ; AVX2-FP-NEXT: vmovaps 1536(%rdi), %ymm7
13908 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13909 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
13910 ; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm9
13911 ; AVX2-FP-NEXT: vmovaps 1632(%rdi), %ymm6
13912 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13913 ; AVX2-FP-NEXT: vmovaps 1600(%rdi), %ymm7
13914 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13915 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
13916 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13917 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2]
13918 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3]
13919 ; AVX2-FP-NEXT: vmovaps 1760(%rdi), %ymm9
13920 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13921 ; AVX2-FP-NEXT: vmovaps 1728(%rdi), %ymm6
13922 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13923 ; AVX2-FP-NEXT: vmovaps 1696(%rdi), %ymm7
13924 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13925 ; AVX2-FP-NEXT: vmovaps 1664(%rdi), %ymm11
13926 ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13927 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5]
13928 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
13929 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13930 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2]
13931 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
13932 ; AVX2-FP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13933 ; AVX2-FP-NEXT: vmovaps 1824(%rdi), %ymm6
13934 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13935 ; AVX2-FP-NEXT: vmovaps 1792(%rdi), %ymm7
13936 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13937 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
13938 ; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm11
13939 ; AVX2-FP-NEXT: vmovaps 1888(%rdi), %ymm6
13940 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13941 ; AVX2-FP-NEXT: vmovaps 1856(%rdi), %ymm7
13942 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13943 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
13944 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13945 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2]
13946 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3]
13947 ; AVX2-FP-NEXT: vmovaps 2016(%rdi), %ymm11
13948 ; AVX2-FP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13949 ; AVX2-FP-NEXT: vmovaps 1984(%rdi), %ymm6
13950 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13951 ; AVX2-FP-NEXT: vmovaps 1952(%rdi), %ymm7
13952 ; AVX2-FP-NEXT: vmovaps 1920(%rdi), %ymm9
13953 ; AVX2-FP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13954 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5]
13955 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13956 ; AVX2-FP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5]
13957 ; AVX2-FP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2]
13958 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
13959 ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13960 ; AVX2-FP-NEXT: vbroadcastss 148(%rdi), %ymm13
13961 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
13962 ; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
13963 ; AVX2-FP-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
13964 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13965 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm14
13966 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
13967 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
13968 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
13969 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7]
13970 ; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15
13971 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
13972 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
13973 ; AVX2-FP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13974 ; AVX2-FP-NEXT: vbroadcastss 404(%rdi), %ymm13
13975 ; AVX2-FP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
13976 ; AVX2-FP-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7]
13977 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
13978 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
13979 ; AVX2-FP-NEXT: vextractf128 $1, %ymm10, %xmm13
13980 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
13981 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5]
13982 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
13983 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
13984 ; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14
13985 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
13986 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
13987 ; AVX2-FP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13988 ; AVX2-FP-NEXT: vbroadcastss 660(%rdi), %ymm12
13989 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
13990 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7]
13991 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
13992 ; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8
13993 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
13994 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5]
13995 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
13996 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
13997 ; AVX2-FP-NEXT: vextractf128 $1, %ymm12, %xmm12
13998 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
13999 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
14000 ; AVX2-FP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14001 ; AVX2-FP-NEXT: vbroadcastss 916(%rdi), %ymm4
14002 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
14003 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7]
14004 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
14005 ; AVX2-FP-NEXT: vextractf128 $1, %ymm4, %xmm4
14006 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
14007 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
14008 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
14009 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7]
14010 ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5
14011 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
14012 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
14013 ; AVX2-FP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14014 ; AVX2-FP-NEXT: vbroadcastss 1172(%rdi), %ymm3
14015 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14016 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
14017 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14018 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
14019 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
14020 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5]
14021 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
14022 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
14023 ; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3
14024 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
14025 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
14026 ; AVX2-FP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14027 ; AVX2-FP-NEXT: vbroadcastss 1428(%rdi), %ymm1
14028 ; AVX2-FP-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
14029 ; AVX2-FP-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7]
14030 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14031 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14032 ; AVX2-FP-NEXT: vextractf128 $1, %ymm0, %xmm0
14033 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
14034 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
14035 ; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
14036 ; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
14037 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
14038 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
14039 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14040 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14041 ; AVX2-FP-NEXT: vbroadcastss 1684(%rdi), %ymm0
14042 ; AVX2-FP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14043 ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7]
14044 ; AVX2-FP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14045 ; AVX2-FP-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
14046 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14047 ; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
14048 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14049 ; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
14050 ; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
14051 ; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
14052 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
14053 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14054 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14055 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14056 ; AVX2-FP-NEXT: vbroadcastss 1940(%rdi), %ymm0
14057 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
14058 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
14059 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14060 ; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
14061 ; AVX2-FP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14062 ; AVX2-FP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
14063 ; AVX2-FP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
14064 ; AVX2-FP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
14065 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
14066 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14067 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14068 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14069 ; AVX2-FP-NEXT: vbroadcastss 248(%rdi), %ymm0
14070 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14071 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14072 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14073 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
14074 ; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14075 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14076 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7]
14077 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14078 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14079 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
14080 ; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14081 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14082 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1
14083 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2]
14084 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14085 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
14086 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14087 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14088 ; AVX2-FP-NEXT: vbroadcastss 504(%rdi), %ymm0
14089 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14090 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14091 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14092 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
14093 ; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14094 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14095 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7]
14096 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14097 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14098 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
14099 ; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14100 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14101 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1
14102 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
14103 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14104 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
14105 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14106 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14107 ; AVX2-FP-NEXT: vbroadcastss 760(%rdi), %ymm0
14108 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14109 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14110 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14111 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
14112 ; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14113 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14114 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7]
14115 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14116 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14117 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
14118 ; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14119 ; AVX2-FP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14120 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1
14121 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
14122 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14123 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
14124 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14125 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14126 ; AVX2-FP-NEXT: vbroadcastss 1016(%rdi), %ymm0
14127 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14128 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14129 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14130 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
14131 ; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14132 ; AVX2-FP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14133 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7]
14134 ; AVX2-FP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14135 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14136 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
14137 ; AVX2-FP-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14138 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm1
14139 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2]
14140 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14141 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
14142 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14143 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14144 ; AVX2-FP-NEXT: vbroadcastss 1272(%rdi), %ymm0
14145 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14146 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14147 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14148 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload
14149 ; AVX2-FP-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14150 ; AVX2-FP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
14151 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14152 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
14153 ; AVX2-FP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14154 ; AVX2-FP-NEXT: vextractf128 $1, %ymm13, %xmm1
14155 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2]
14156 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14157 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
14158 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14159 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14160 ; AVX2-FP-NEXT: vbroadcastss 1528(%rdi), %ymm0
14161 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14162 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14163 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14164 ; AVX2-FP-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
14165 ; AVX2-FP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14166 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
14167 ; AVX2-FP-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
14168 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14169 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
14170 ; AVX2-FP-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14171 ; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm1
14172 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2]
14173 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14174 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
14175 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14176 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14177 ; AVX2-FP-NEXT: vbroadcastss 1784(%rdi), %ymm0
14178 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14179 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
14180 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14181 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
14182 ; AVX2-FP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14183 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14184 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
14185 ; AVX2-FP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14186 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14187 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
14188 ; AVX2-FP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
14189 ; AVX2-FP-NEXT: vextractf128 $1, %ymm7, %xmm1
14190 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2]
14191 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14192 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
14193 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14194 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14195 ; AVX2-FP-NEXT: vbroadcastss 2040(%rdi), %ymm0
14196 ; AVX2-FP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
14197 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
14198 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14199 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
14200 ; AVX2-FP-NEXT: # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14201 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14202 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
14203 ; AVX2-FP-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14204 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14205 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
14206 ; AVX2-FP-NEXT: # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14207 ; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm0
14208 ; AVX2-FP-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2]
14209 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
14210 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
14211 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14212 ; AVX2-FP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
14213 ; AVX2-FP-NEXT: vbroadcastss 220(%rdi), %ymm0
14214 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14215 ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14216 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14217 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
14218 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14219 ; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
14220 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
14221 ; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
14222 ; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15
14223 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
14224 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14225 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14226 ; AVX2-FP-NEXT: vbroadcastss 476(%rdi), %ymm0
14227 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14228 ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14229 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14230 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
14231 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14232 ; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
14233 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
14234 ; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
14235 ; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15
14236 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
14237 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14238 ; AVX2-FP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14239 ; AVX2-FP-NEXT: vbroadcastss 732(%rdi), %ymm0
14240 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14241 ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14242 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14243 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
14244 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14245 ; AVX2-FP-NEXT: vextractf128 $1, %ymm1, %xmm1
14246 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
14247 ; AVX2-FP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
14248 ; AVX2-FP-NEXT: vextractf128 $1, %ymm15, %xmm15
14249 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
14250 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14251 ; AVX2-FP-NEXT: vbroadcastss 988(%rdi), %ymm0
14252 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14253 ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14254 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
14255 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
14256 ; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm1
14257 ; AVX2-FP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
14258 ; AVX2-FP-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7]
14259 ; AVX2-FP-NEXT: vextractf128 $1, %ymm14, %xmm14
14260 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
14261 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14262 ; AVX2-FP-NEXT: vbroadcastss 1244(%rdi), %ymm0
14263 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14264 ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14265 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3]
14266 ; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm1
14267 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7]
14268 ; AVX2-FP-NEXT: vextractf128 $1, %ymm11, %xmm11
14269 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
14270 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
14271 ; AVX2-FP-NEXT: vbroadcastss 1500(%rdi), %ymm0
14272 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14273 ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14274 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3]
14275 ; AVX2-FP-NEXT: vextractf128 $1, %ymm8, %xmm8
14276 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
14277 ; AVX2-FP-NEXT: vextractf128 $1, %ymm9, %xmm9
14278 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
14279 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
14280 ; AVX2-FP-NEXT: vbroadcastss 1756(%rdi), %ymm0
14281 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14282 ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14283 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3]
14284 ; AVX2-FP-NEXT: vextractf128 $1, %ymm5, %xmm5
14285 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7]
14286 ; AVX2-FP-NEXT: vextractf128 $1, %ymm6, %xmm6
14287 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
14288 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
14289 ; AVX2-FP-NEXT: vbroadcastss 2012(%rdi), %ymm0
14290 ; AVX2-FP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
14291 ; AVX2-FP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
14292 ; AVX2-FP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3]
14293 ; AVX2-FP-NEXT: vextractf128 $1, %ymm2, %xmm2
14294 ; AVX2-FP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7]
14295 ; AVX2-FP-NEXT: vextractf128 $1, %ymm3, %xmm3
14296 ; AVX2-FP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
14297 ; AVX2-FP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
14298 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14299 ; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rsi)
14300 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14301 ; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rsi)
14302 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14303 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rsi)
14304 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14305 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi)
14306 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14307 ; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rsi)
14308 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14309 ; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rsi)
14310 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14311 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rsi)
14312 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14313 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi)
14314 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14315 ; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rdx)
14316 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14317 ; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rdx)
14318 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14319 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rdx)
14320 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14321 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx)
14322 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14323 ; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rdx)
14324 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14325 ; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rdx)
14326 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14327 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rdx)
14328 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14329 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
14330 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14331 ; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rcx)
14332 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14333 ; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rcx)
14334 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14335 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rcx)
14336 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14337 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx)
14338 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14339 ; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rcx)
14340 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14341 ; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rcx)
14342 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14343 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rcx)
14344 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14345 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rcx)
14346 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14347 ; AVX2-FP-NEXT: vmovaps %ymm2, 192(%r8)
14348 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14349 ; AVX2-FP-NEXT: vmovaps %ymm2, 128(%r8)
14350 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14351 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%r8)
14352 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14353 ; AVX2-FP-NEXT: vmovaps %ymm2, (%r8)
14354 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14355 ; AVX2-FP-NEXT: vmovaps %ymm2, 224(%r8)
14356 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14357 ; AVX2-FP-NEXT: vmovaps %ymm2, 160(%r8)
14358 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14359 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%r8)
14360 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14361 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r8)
14362 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14363 ; AVX2-FP-NEXT: vmovaps %ymm2, 224(%r9)
14364 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14365 ; AVX2-FP-NEXT: vmovaps %ymm2, 192(%r9)
14366 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14367 ; AVX2-FP-NEXT: vmovaps %ymm2, 160(%r9)
14368 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14369 ; AVX2-FP-NEXT: vmovaps %ymm2, 128(%r9)
14370 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14371 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%r9)
14372 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14373 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%r9)
14374 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14375 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%r9)
14376 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14377 ; AVX2-FP-NEXT: vmovaps %ymm2, (%r9)
14378 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14379 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14380 ; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rax)
14381 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14382 ; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rax)
14383 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14384 ; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rax)
14385 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14386 ; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rax)
14387 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14388 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax)
14389 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14390 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rax)
14391 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14392 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax)
14393 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14394 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rax)
14395 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14396 ; AVX2-FP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
14397 ; AVX2-FP-NEXT: vmovaps %ymm2, 224(%rax)
14398 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14399 ; AVX2-FP-NEXT: vmovaps %ymm2, 192(%rax)
14400 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14401 ; AVX2-FP-NEXT: vmovaps %ymm2, 160(%rax)
14402 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14403 ; AVX2-FP-NEXT: vmovaps %ymm2, 128(%rax)
14404 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14405 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rax)
14406 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14407 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rax)
14408 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14409 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rax)
14410 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
14411 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rax)
14412 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14413 ; AVX2-FP-NEXT: vmovaps %ymm0, 224(%rax)
14414 ; AVX2-FP-NEXT: vmovaps %ymm5, 192(%rax)
14415 ; AVX2-FP-NEXT: vmovaps %ymm8, 160(%rax)
14416 ; AVX2-FP-NEXT: vmovaps %ymm1, 128(%rax)
14417 ; AVX2-FP-NEXT: vmovaps %ymm14, 96(%rax)
14418 ; AVX2-FP-NEXT: vmovaps %ymm15, 64(%rax)
14419 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14420 ; AVX2-FP-NEXT: vmovaps %ymm0, 32(%rax)
14421 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
14422 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rax)
14423 ; AVX2-FP-NEXT: addq $3528, %rsp # imm = 0xDC8
14424 ; AVX2-FP-NEXT: vzeroupper
14425 ; AVX2-FP-NEXT: retq
14426 ;
14427 ; AVX2-FCP-LABEL: load_i32_stride8_vf64:
14428 ; AVX2-FCP: # %bb.0:
14429 ; AVX2-FCP-NEXT: subq $3528, %rsp # imm = 0xDC8
14430 ; AVX2-FCP-NEXT: vmovaps 288(%rdi), %xmm10
14431 ; AVX2-FCP-NEXT: vmovaps 256(%rdi), %xmm0
14432 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14433 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
14434 ; AVX2-FCP-NEXT: vmovaps %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14435 ; AVX2-FCP-NEXT: vmovaps 352(%rdi), %xmm2
14436 ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm1
14437 ; AVX2-FCP-NEXT: vmovaps %xmm2, %xmm9
14438 ; AVX2-FCP-NEXT: vmovaps 320(%rdi), %xmm2
14439 ; AVX2-FCP-NEXT: vmovaps %xmm2, (%rsp) # 16-byte Spill
14440 ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2
14441 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14442 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14443 ; AVX2-FCP-NEXT: vmovaps 416(%rdi), %xmm1
14444 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14445 ; AVX2-FCP-NEXT: vmovaps 384(%rdi), %xmm2
14446 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14447 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14448 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
14449 ; AVX2-FCP-NEXT: vmovaps 480(%rdi), %xmm3
14450 ; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm2
14451 ; AVX2-FCP-NEXT: vmovaps %xmm3, %xmm13
14452 ; AVX2-FCP-NEXT: vmovaps 448(%rdi), %xmm3
14453 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14454 ; AVX2-FCP-NEXT: vbroadcastss %xmm3, %xmm3
14455 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
14456 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
14457 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
14458 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14459 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14460 ; AVX2-FCP-NEXT: vmovaps 800(%rdi), %xmm0
14461 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14462 ; AVX2-FCP-NEXT: vmovaps 768(%rdi), %xmm1
14463 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14464 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14465 ; AVX2-FCP-NEXT: vmovaps 864(%rdi), %xmm12
14466 ; AVX2-FCP-NEXT: vbroadcastss %xmm12, %xmm1
14467 ; AVX2-FCP-NEXT: vmovaps 832(%rdi), %xmm2
14468 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14469 ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2
14470 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14471 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14472 ; AVX2-FCP-NEXT: vmovaps 992(%rdi), %xmm1
14473 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14474 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
14475 ; AVX2-FCP-NEXT: vmovaps 960(%rdi), %xmm2
14476 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14477 ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2
14478 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14479 ; AVX2-FCP-NEXT: vmovaps 928(%rdi), %xmm2
14480 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14481 ; AVX2-FCP-NEXT: vmovaps 896(%rdi), %xmm3
14482 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14483 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
14484 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
14485 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
14486 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14487 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14488 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14489 ; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %xmm0
14490 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14491 ; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0
14492 ; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %xmm1
14493 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14494 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
14495 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14496 ; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %xmm1
14497 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14498 ; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %xmm2
14499 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14500 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14501 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14502 ; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %xmm1
14503 ; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14504 ; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
14505 ; AVX2-FCP-NEXT: vmovaps 1472(%rdi), %xmm2
14506 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14507 ; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2
14508 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14509 ; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %xmm2
14510 ; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14511 ; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %xmm3
14512 ; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14513 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
14514 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
14515 ; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
14516 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14517 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
14518 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14519 ; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %xmm0
14520 ; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovaps 1824(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps 2016(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm1
; AVX2-FCP-NEXT: vmovaps 1984(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm2
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FCP-NEXT: vmovaps 1952(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 1920(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 608(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovaps 576(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovaps 544(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 512(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps 736(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-FCP-NEXT: vmovaps 704(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FCP-NEXT: vmovaps 672(%rdi), %xmm3
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 640(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps 1248(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %xmm3
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovaps 1568(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 1536(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps 1760(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm1, %xmm1
; AVX2-FCP-NEXT: vmovaps 1728(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm2, %xmm2
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %xmm3
; AVX2-FCP-NEXT: vmovaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 1664(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 224(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vbroadcastss %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovaps 192(%rdi), %xmm11
; AVX2-FCP-NEXT: vbroadcastss %xmm11, %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovaps 160(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 128(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %xmm8
; AVX2-FCP-NEXT: vbroadcastss %xmm8, %xmm1
; AVX2-FCP-NEXT: vmovaps %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %xmm7
; AVX2-FCP-NEXT: vbroadcastss %xmm7, %xmm2
; AVX2-FCP-NEXT: vmovaps %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX2-FCP-NEXT: vmovaps (%rdi), %xmm5
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %xmm6
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm3[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps %xmm13, %xmm9
; AVX2-FCP-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[1,1,1,1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm11[0],mem[0],xmm11[1],mem[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} xmm1 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm2[0],mem[1],xmm2[2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm6 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm9[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm10[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[2,2,2,2]
; AVX2-FCP-NEXT: vblendps $7, (%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm12[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm13[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm9[2,2,2,2]
; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm10 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm14[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[2,2,2,2]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm8 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[2,2,2,2]
; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[2,2,2,2]
; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm8[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm3[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[2,2,2,2]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm6[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm0 = xmm5[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,2,2,2]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[2,2,2,2]
; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm15
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FCP-NEXT: vshufps {{.*#+}} xmm15 = xmm2[2,2,2,2]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm1[0,1,2],xmm15[3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm5[0,1],xmm15[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm5 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vmovaps %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm15 = mem[2,2,2,2]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm15 = xmm0[0,1,2],xmm15[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm14
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5],ymm15[6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm15 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX2-FCP-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = mem[2,2,2,2]
; AVX2-FCP-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = mem[0,1,2],xmm13[3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm15[0,1],xmm13[2,3]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm14[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rsp), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm13 = xmm5[1],xmm13[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm12 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5],ymm14[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm12 = xmm5[2],xmm9[2],xmm5[3],xmm9[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm11 = xmm5[1],xmm12[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm12 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm12 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm12, %ymm0, %ymm12
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm13, %ymm0, %ymm13
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm11 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm11 = xmm7[2],mem[2],xmm7[3],mem[3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm9 = xmm10[1],xmm11[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm11 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm11 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm10 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm7 = xmm8[1],xmm9[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm9 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm9 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm8 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm5 = xmm5[1],xmm7[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm7 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm7 = xmm6[2],mem[2],xmm6[3],mem[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm6 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm5 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm3 = xmm3[1],xmm5[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm5 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm5 = xmm4[2],mem[2],xmm4[3],mem[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm4 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhps {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm1[1],xmm3[1]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} xmm1 = xmm15[1],xmm1[1]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 32(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps (%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vmovaps 96(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 64(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps 224(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 192(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 160(%rdi), %ymm15
; AVX2-FCP-NEXT: vmovaps 128(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[4],ymm15[4],ymm1[5],ymm15[5]
; AVX2-FCP-NEXT: vmovups %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[2],ymm2[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 288(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 256(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vmovaps 352(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 320(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps 480(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 448(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 416(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 384(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm12 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm12[0],ymm1[2],ymm12[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 544(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 512(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vmovaps 608(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 576(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps 736(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 704(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 672(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovaps 640(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5]
; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm8 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[4],ymm2[4],ymm3[5],ymm2[5]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm8[0],ymm1[2],ymm8[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 800(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 768(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vmovaps 864(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 832(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps 992(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 960(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 928(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovaps 896(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[4],ymm3[4],ymm1[5],ymm3[5]
; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm5 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[4],ymm2[4],ymm5[5],ymm2[5]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[2],ymm5[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1056(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1024(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[4],ymm0[4],ymm1[5],ymm0[5]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX2-FCP-NEXT: vmovaps 1120(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1088(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[4],ymm1[4],ymm2[5],ymm1[5]
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vmovaps 1248(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1216(%rdi), %ymm7
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1184(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovaps 1152(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[4],ymm2[4],ymm6[5],ymm2[5]
; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm1 = ymm7[0],ymm1[0],ymm7[1],ymm1[1],ymm7[4],ymm1[4],ymm7[5],ymm1[5]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm6 = ymm6[0],ymm1[0],ymm6[2],ymm1[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1312(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1280(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm6[0],ymm0[0],ymm6[1],ymm0[1],ymm6[4],ymm0[4],ymm6[5],ymm0[5]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm6
; AVX2-FCP-NEXT: vmovaps 1376(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1344(%rdi), %ymm7
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm7[0],ymm0[0],ymm7[1],ymm0[1],ymm7[4],ymm0[4],ymm7[5],ymm0[5]
; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm7 = xmm6[0,1],xmm7[2,3]
; AVX2-FCP-NEXT: vmovaps 1504(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1472(%rdi), %ymm10
; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1440(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1408(%rdi), %ymm9
; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[1],ymm0[1],ymm9[4],ymm0[4],ymm9[5],ymm0[5]
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm0 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[4],ymm6[4],ymm10[5],ymm6[5]
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm9 = ymm9[0],ymm0[0],ymm9[2],ymm0[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1568(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1536(%rdi), %ymm7
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm7 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm9
; AVX2-FCP-NEXT: vmovaps 1632(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1600(%rdi), %ymm7
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm10 = ymm6[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm10 = xmm9[0,1],xmm10[2,3]
; AVX2-FCP-NEXT: vmovaps 1760(%rdi), %ymm9
; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1728(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1696(%rdi), %ymm7
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1664(%rdi), %ymm11
; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm11[0],ymm7[0],ymm11[1],ymm7[1],ymm11[4],ymm7[4],ymm11[5],ymm7[5]
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm6[0],ymm9[0],ymm6[1],ymm9[1],ymm6[4],ymm9[4],ymm6[5],ymm9[5]
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm11 = ymm11[0],ymm6[0],ymm11[2],ymm6[2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
; AVX2-FCP-NEXT: vmovups %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1824(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1792(%rdi), %ymm7
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm10 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm11
; AVX2-FCP-NEXT: vmovaps 1888(%rdi), %ymm6
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovaps 1856(%rdi), %ymm7
; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm6 = ymm7[0],ymm6[0],ymm7[1],ymm6[1],ymm7[4],ymm6[4],ymm7[5],ymm6[5]
; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm13 = ymm6[2,2,2,2]
; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm11[0,1],xmm13[2,3]
; AVX2-FCP-NEXT: vmovaps 2016(%rdi), %ymm11
15302 ; AVX2-FCP-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15303 ; AVX2-FCP-NEXT: vmovaps 1984(%rdi), %ymm6
15304 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15305 ; AVX2-FCP-NEXT: vmovaps 1952(%rdi), %ymm7
15306 ; AVX2-FCP-NEXT: vmovaps 1920(%rdi), %ymm9
15307 ; AVX2-FCP-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15308 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm14 = ymm9[0],ymm7[0],ymm9[1],ymm7[1],ymm9[4],ymm7[4],ymm9[5],ymm7[5]
15309 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15310 ; AVX2-FCP-NEXT: vunpcklps {{.*#+}} ymm11 = ymm6[0],ymm11[0],ymm6[1],ymm11[1],ymm6[4],ymm11[4],ymm6[5],ymm11[5]
15311 ; AVX2-FCP-NEXT: vunpcklpd {{.*#+}} ymm14 = ymm14[0],ymm11[0],ymm14[2],ymm11[2]
15312 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
15313 ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15314 ; AVX2-FCP-NEXT: vbroadcastss 148(%rdi), %ymm13
15315 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
15316 ; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
15317 ; AVX2-FCP-NEXT: # ymm13 = ymm13[0,1,2,3,4,5],mem[6,7]
15318 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
15319 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm14
15320 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
15321 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm15 = ymm9[1,1,1,1,5,5,5,5]
15322 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
15323 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm15[0],ymm6[1],ymm15[2,3,4],ymm6[5],ymm15[6,7]
15324 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15
15325 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
15326 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
15327 ; AVX2-FCP-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15328 ; AVX2-FCP-NEXT: vbroadcastss 404(%rdi), %ymm13
15329 ; AVX2-FCP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13, %ymm13 # 32-byte Folded Reload
15330 ; AVX2-FCP-NEXT: # ymm13 = ymm13[0,1,2,3,4],mem[5],ymm13[6,7]
15331 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
15332 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
15333 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm10, %xmm13
15334 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload
15335 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,1,1,1,5,5,5,5]
15336 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload
15337 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm14[0],ymm10[1],ymm14[2,3,4],ymm10[5],ymm14[6,7]
15338 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14
15339 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
15340 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
15341 ; AVX2-FCP-NEXT: vmovups %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15342 ; AVX2-FCP-NEXT: vbroadcastss 660(%rdi), %ymm12
15343 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm4[5],ymm12[6,7]
15344 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5],ymm8[6,7]
15345 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
15346 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8
15347 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
15348 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm12 = ymm13[1,1,1,1,5,5,5,5]
15349 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload
15350 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm12 = ymm12[0],ymm14[1],ymm12[2,3,4],ymm14[5],ymm12[6,7]
15351 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm12, %xmm12
15352 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm12[0,1],xmm8[2,3]
15353 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
15354 ; AVX2-FCP-NEXT: vmovups %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15355 ; AVX2-FCP-NEXT: vbroadcastss 916(%rdi), %ymm4
15356 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
15357 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm5[6,7]
15358 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
15359 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm4, %xmm4
15360 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
15361 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm5 = ymm8[1,1,1,1,5,5,5,5]
15362 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
15363 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0],ymm12[1],ymm5[2,3,4],ymm12[5],ymm5[6,7]
15364 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
15365 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
15366 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
15367 ; AVX2-FCP-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15368 ; AVX2-FCP-NEXT: vbroadcastss 1172(%rdi), %ymm3
15369 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15370 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
15371 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15372 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
15373 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
15374 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm4[1,1,1,1,5,5,5,5]
15375 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
15376 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3,4],ymm5[5],ymm3[6,7]
15377 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3
15378 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
15379 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
15380 ; AVX2-FCP-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15381 ; AVX2-FCP-NEXT: vbroadcastss 1428(%rdi), %ymm1
15382 ; AVX2-FCP-NEXT: vblendps $32, (%rsp), %ymm1, %ymm1 # 32-byte Folded Reload
15383 ; AVX2-FCP-NEXT: # ymm1 = ymm1[0,1,2,3,4],mem[5],ymm1[6,7]
15384 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15385 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15386 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm0, %xmm0
15387 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
15388 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5]
15389 ; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
15390 ; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
15391 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
15392 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
15393 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15394 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15395 ; AVX2-FCP-NEXT: vbroadcastss 1684(%rdi), %ymm0
15396 ; AVX2-FCP-NEXT: vblendps $32, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15397 ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2,3,4],mem[5],ymm0[6,7]
15398 ; AVX2-FCP-NEXT: vblendps $192, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15399 ; AVX2-FCP-NEXT: # ymm0 = ymm0[0,1,2,3,4,5],mem[6,7]
15400 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15401 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
15402 ; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15403 ; AVX2-FCP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
15404 ; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
15405 ; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
15406 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
15407 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15408 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15409 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15410 ; AVX2-FCP-NEXT: vbroadcastss 1940(%rdi), %ymm0
15411 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm7[5],ymm0[6,7]
15412 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm11[6,7]
15413 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15414 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
15415 ; AVX2-FCP-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15416 ; AVX2-FCP-NEXT: # ymm2 = mem[1,1,1,1,5,5,5,5]
15417 ; AVX2-FCP-NEXT: vblendps $34, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
15418 ; AVX2-FCP-NEXT: # ymm2 = ymm2[0],mem[1],ymm2[2,3,4],mem[5],ymm2[6,7]
15419 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
15420 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15421 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15422 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15423 ; AVX2-FCP-NEXT: vbroadcastss 248(%rdi), %ymm0
15424 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15425 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15426 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15427 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
15428 ; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15429 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15430 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm9[2],ymm6[2],ymm9[3],ymm6[3],ymm9[6],ymm6[6],ymm9[7],ymm6[7]
15431 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15432 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15433 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
15434 ; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15435 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15436 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1
15437 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm6[2,2,2,2]
15438 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15439 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
15440 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15441 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15442 ; AVX2-FCP-NEXT: vbroadcastss 504(%rdi), %ymm0
15443 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15444 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15445 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15446 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
15447 ; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15448 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15449 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm15[2],ymm10[2],ymm15[3],ymm10[3],ymm15[6],ymm10[6],ymm15[7],ymm10[7]
15450 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15451 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15452 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
15453 ; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15454 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15455 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1
15456 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
15457 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15458 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
15459 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15460 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15461 ; AVX2-FCP-NEXT: vbroadcastss 760(%rdi), %ymm0
15462 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15463 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15464 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15465 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
15466 ; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15467 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15468 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm13[2],ymm14[2],ymm13[3],ymm14[3],ymm13[6],ymm14[6],ymm13[7],ymm14[7]
15469 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15470 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15471 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
15472 ; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15473 ; AVX2-FCP-NEXT: vmovups %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15474 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1
15475 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm7[2,2,2,2]
15476 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15477 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
15478 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15479 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15480 ; AVX2-FCP-NEXT: vbroadcastss 1016(%rdi), %ymm0
15481 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15482 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15483 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15484 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
15485 ; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15486 ; AVX2-FCP-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15487 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm2 = ymm8[2],ymm12[2],ymm8[3],ymm12[3],ymm8[6],ymm12[6],ymm8[7],ymm12[7]
15488 ; AVX2-FCP-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15489 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15490 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm14 # 32-byte Folded Reload
15491 ; AVX2-FCP-NEXT: # ymm14 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15492 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm1
15493 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm14[2,2,2,2]
15494 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15495 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
15496 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15497 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15498 ; AVX2-FCP-NEXT: vbroadcastss 1272(%rdi), %ymm0
15499 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15500 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15501 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15502 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm12 # 32-byte Folded Reload
15503 ; AVX2-FCP-NEXT: # ymm12 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15504 ; AVX2-FCP-NEXT: vunpckhps {{.*#+}} ymm13 = ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[6],ymm5[6],ymm4[7],ymm5[7]
15505 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15506 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm11 # 32-byte Folded Reload
15507 ; AVX2-FCP-NEXT: # ymm11 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15508 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm13, %xmm1
15509 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm11[2,2,2,2]
15510 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15511 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5],ymm0[6,7]
15512 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15513 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15514 ; AVX2-FCP-NEXT: vbroadcastss 1528(%rdi), %ymm0
15515 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15516 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15517 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15518 ; AVX2-FCP-NEXT: vunpckhps (%rsp), %ymm1, %ymm10 # 32-byte Folded Reload
15519 ; AVX2-FCP-NEXT: # ymm10 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15520 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm9 # 32-byte Folded Reload
15521 ; AVX2-FCP-NEXT: # ymm9 = ymm3[2],mem[2],ymm3[3],mem[3],ymm3[6],mem[6],ymm3[7],mem[7]
15522 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15523 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm8 # 32-byte Folded Reload
15524 ; AVX2-FCP-NEXT: # ymm8 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15525 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm1
15526 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm8[2,2,2,2]
15527 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15528 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5],ymm0[6,7]
15529 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15530 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15531 ; AVX2-FCP-NEXT: vbroadcastss 1784(%rdi), %ymm0
15532 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15533 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,2,3,4,5,6],ymm0[7]
15534 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15535 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm6 # 32-byte Folded Reload
15536 ; AVX2-FCP-NEXT: # ymm6 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15537 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15538 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm7 # 32-byte Folded Reload
15539 ; AVX2-FCP-NEXT: # ymm7 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15540 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15541 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm5 # 32-byte Folded Reload
15542 ; AVX2-FCP-NEXT: # ymm5 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
15543 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm7, %xmm1
15544 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm2 = ymm5[2,2,2,2]
15545 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15546 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
15547 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15548 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15549 ; AVX2-FCP-NEXT: vbroadcastss 2040(%rdi), %ymm0
15550 ; AVX2-FCP-NEXT: vblendps $127, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
15551 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,6],ymm0[7]
15552 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15553 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload
15554 ; AVX2-FCP-NEXT: # ymm4 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15555 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15556 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm3 # 32-byte Folded Reload
15557 ; AVX2-FCP-NEXT: # ymm3 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15558 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15559 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
15560 ; AVX2-FCP-NEXT: # ymm2 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15561 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm0
15562 ; AVX2-FCP-NEXT: vpermpd {{.*#+}} ymm15 = ymm2[2,2,2,2]
15563 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
15564 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
15565 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15566 ; AVX2-FCP-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
15567 ; AVX2-FCP-NEXT: vbroadcastss 220(%rdi), %ymm0
15568 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15569 ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15570 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15571 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15572 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15573 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
15574 ; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
15575 ; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
15576 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15
15577 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
15578 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15579 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15580 ; AVX2-FCP-NEXT: vbroadcastss 476(%rdi), %ymm0
15581 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15582 ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15583 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15584 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15585 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15586 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
15587 ; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
15588 ; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
15589 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15
15590 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
15591 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15592 ; AVX2-FCP-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15593 ; AVX2-FCP-NEXT: vbroadcastss 732(%rdi), %ymm0
15594 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15595 ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15596 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15597 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15598 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15599 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm1, %xmm1
15600 ; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
15601 ; AVX2-FCP-NEXT: # ymm15 = mem[2,3,2,3,6,7,6,7]
15602 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm15, %xmm15
15603 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm15[0,1],xmm1[2,3]
15604 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15605 ; AVX2-FCP-NEXT: vbroadcastss 988(%rdi), %ymm0
15606 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15607 ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15608 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
15609 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
15610 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm1
15611 ; AVX2-FCP-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
15612 ; AVX2-FCP-NEXT: # ymm14 = mem[2,3,2,3,6,7,6,7]
15613 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm14, %xmm14
15614 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
15615 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15616 ; AVX2-FCP-NEXT: vbroadcastss 1244(%rdi), %ymm0
15617 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15618 ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15619 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm12[1],ymm0[1],ymm12[3],ymm0[3]
15620 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm1
15621 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm11 = ymm13[2,3,2,3,6,7,6,7]
15622 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm11, %xmm11
15623 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1],xmm1[2,3]
15624 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
15625 ; AVX2-FCP-NEXT: vbroadcastss 1500(%rdi), %ymm0
15626 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15627 ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15628 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm10[1],ymm0[1],ymm10[3],ymm0[3]
15629 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm8, %xmm8
15630 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm9 = ymm9[2,3,2,3,6,7,6,7]
15631 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm9, %xmm9
15632 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
15633 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm0[4,5,6,7]
15634 ; AVX2-FCP-NEXT: vbroadcastss 1756(%rdi), %ymm0
15635 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15636 ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15637 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm6[1],ymm0[1],ymm6[3],ymm0[3]
15638 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm5, %xmm5
15639 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm6 = ymm7[2,3,2,3,6,7,6,7]
15640 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm6, %xmm6
15641 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
15642 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm0[4,5,6,7]
15643 ; AVX2-FCP-NEXT: vbroadcastss 2012(%rdi), %ymm0
15644 ; AVX2-FCP-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
15645 ; AVX2-FCP-NEXT: # ymm0 = ymm0[2],mem[2],ymm0[3],mem[3],ymm0[6],mem[6],ymm0[7],mem[7]
15646 ; AVX2-FCP-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm4[1],ymm0[1],ymm4[3],ymm0[3]
15647 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm2, %xmm2
15648 ; AVX2-FCP-NEXT: vshufps {{.*#+}} ymm3 = ymm3[2,3,2,3,6,7,6,7]
15649 ; AVX2-FCP-NEXT: vextractf128 $1, %ymm3, %xmm3
15650 ; AVX2-FCP-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
15651 ; AVX2-FCP-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
15652 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15653 ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rsi)
15654 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15655 ; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rsi)
15656 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15657 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rsi)
15658 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15659 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rsi)
15660 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15661 ; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rsi)
15662 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15663 ; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rsi)
15664 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15665 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rsi)
15666 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15667 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rsi)
15668 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15669 ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rdx)
15670 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15671 ; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rdx)
15672 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15673 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rdx)
15674 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15675 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rdx)
15676 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15677 ; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rdx)
15678 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15679 ; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rdx)
15680 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15681 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rdx)
15682 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15683 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rdx)
15684 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15685 ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rcx)
15686 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15687 ; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rcx)
15688 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15689 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rcx)
15690 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15691 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rcx)
15692 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15693 ; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rcx)
15694 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15695 ; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rcx)
15696 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15697 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rcx)
15698 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15699 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rcx)
15700 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15701 ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%r8)
15702 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15703 ; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%r8)
15704 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15705 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r8)
15706 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15707 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%r8)
15708 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15709 ; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%r8)
15710 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15711 ; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%r8)
15712 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15713 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r8)
15714 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15715 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r8)
15716 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15717 ; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%r9)
15718 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15719 ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%r9)
15720 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15721 ; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%r9)
15722 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15723 ; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%r9)
15724 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15725 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%r9)
15726 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15727 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%r9)
15728 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15729 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%r9)
15730 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15731 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%r9)
15732 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15733 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15734 ; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rax)
15735 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15736 ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rax)
15737 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15738 ; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rax)
15739 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15740 ; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rax)
15741 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15742 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax)
15743 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15744 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rax)
15745 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15746 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax)
15747 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15748 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax)
15749 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15750 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
15751 ; AVX2-FCP-NEXT: vmovaps %ymm2, 224(%rax)
15752 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15753 ; AVX2-FCP-NEXT: vmovaps %ymm2, 192(%rax)
15754 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15755 ; AVX2-FCP-NEXT: vmovaps %ymm2, 160(%rax)
15756 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15757 ; AVX2-FCP-NEXT: vmovaps %ymm2, 128(%rax)
15758 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15759 ; AVX2-FCP-NEXT: vmovaps %ymm2, 96(%rax)
15760 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15761 ; AVX2-FCP-NEXT: vmovaps %ymm2, 64(%rax)
15762 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15763 ; AVX2-FCP-NEXT: vmovaps %ymm2, 32(%rax)
15764 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
15765 ; AVX2-FCP-NEXT: vmovaps %ymm2, (%rax)
15766 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
15767 ; AVX2-FCP-NEXT: vmovaps %ymm0, 224(%rax)
15768 ; AVX2-FCP-NEXT: vmovaps %ymm5, 192(%rax)
15769 ; AVX2-FCP-NEXT: vmovaps %ymm8, 160(%rax)
15770 ; AVX2-FCP-NEXT: vmovaps %ymm1, 128(%rax)
15771 ; AVX2-FCP-NEXT: vmovaps %ymm14, 96(%rax)
15772 ; AVX2-FCP-NEXT: vmovaps %ymm15, 64(%rax)
15773 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15774 ; AVX2-FCP-NEXT: vmovaps %ymm0, 32(%rax)
15775 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
15776 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rax)
15777 ; AVX2-FCP-NEXT: addq $3528, %rsp # imm = 0xDC8
15778 ; AVX2-FCP-NEXT: vzeroupper
15779 ; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: load_i32_stride8_vf64:
; AVX512: # %bb.0:
15783 ; AVX512-NEXT: subq $3144, %rsp # imm = 0xC48
15784 ; AVX512-NEXT: vmovdqa64 320(%rdi), %zmm11
15785 ; AVX512-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15786 ; AVX512-NEXT: vmovdqa64 448(%rdi), %zmm18
15787 ; AVX512-NEXT: vmovdqa64 1600(%rdi), %zmm31
15788 ; AVX512-NEXT: vmovaps 1536(%rdi), %zmm0
15789 ; AVX512-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
15790 ; AVX512-NEXT: vmovdqa64 1728(%rdi), %zmm24
15791 ; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15792 ; AVX512-NEXT: vmovaps 1664(%rdi), %zmm0
15793 ; AVX512-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15794 ; AVX512-NEXT: vmovdqa64 1856(%rdi), %zmm21
15795 ; AVX512-NEXT: vmovdqa64 1792(%rdi), %zmm26
15796 ; AVX512-NEXT: vmovdqa64 1984(%rdi), %zmm22
15797 ; AVX512-NEXT: vmovdqa64 1920(%rdi), %zmm5
15798 ; AVX512-NEXT: vmovdqa64 1088(%rdi), %zmm14
15799 ; AVX512-NEXT: vmovdqa64 1024(%rdi), %zmm3
15800 ; AVX512-NEXT: vmovdqa64 1216(%rdi), %zmm19
15801 ; AVX512-NEXT: vmovdqa64 1152(%rdi), %zmm2
15802 ; AVX512-NEXT: vmovdqa64 1344(%rdi), %zmm13
15803 ; AVX512-NEXT: vmovdqa64 1280(%rdi), %zmm27
15804 ; AVX512-NEXT: vmovdqa64 1472(%rdi), %zmm20
15805 ; AVX512-NEXT: vmovdqa64 1408(%rdi), %zmm10
15806 ; AVX512-NEXT: vmovdqa64 576(%rdi), %zmm17
15807 ; AVX512-NEXT: vmovdqa64 512(%rdi), %zmm7
15808 ; AVX512-NEXT: vmovdqa64 704(%rdi), %zmm9
15809 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15810 ; AVX512-NEXT: vmovdqa64 640(%rdi), %zmm12
15811 ; AVX512-NEXT: vmovdqa64 832(%rdi), %zmm6
15812 ; AVX512-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15813 ; AVX512-NEXT: vmovdqa64 768(%rdi), %zmm28
15814 ; AVX512-NEXT: vmovdqa64 960(%rdi), %zmm23
15815 ; AVX512-NEXT: vmovdqa64 896(%rdi), %zmm4
15816 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
15817 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15818 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm1
15819 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm16
15820 ; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
15821 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4
15822 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
15823 ; AVX512-NEXT: movb $-64, %al
15824 ; AVX512-NEXT: kmovw %eax, %k1
15825 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15826 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm1
15827 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm15
15828 ; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
15829 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm12
15830 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm9
15831 ; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm12
15832 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm25
15833 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15834 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15835 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15836 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1
15837 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm6
15838 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15839 ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
15840 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4
15841 ; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm4
15842 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm29
15843 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15844 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm1
15845 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm8
15846 ; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
15847 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm30
15848 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
15849 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12
15850 ; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm12
15851 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13
15852 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15853 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15854 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15855 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14
15856 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15857 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1
15858 ; AVX512-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
15859 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm17
15860 ; AVX512-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15861 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4
15862 ; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
15863 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15864 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15865 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1
15866 ; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
15867 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
15868 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12
15869 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm24
15870 ; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
15871 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15872 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm5
15873 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15874 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15875 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15876 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1
15877 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
15878 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm22
15879 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4
15880 ; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm4
15881 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15882 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1
15883 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15884 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5
15885 ; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15886 ; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm1
15887 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm7
15888 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm19
15889 ; AVX512-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
15890 ; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15891 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15892 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
15893 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15894 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
15895 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15896 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1
15897 ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
15898 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4
15899 ; AVX512-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15900 ; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
15901 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15902 ; AVX512-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15903 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1
15904 ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm31
15905 ; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm1
15906 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm12
15907 ; AVX512-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15908 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm30
15909 ; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm12
15910 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15911 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15912 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15913 ; AVX512-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15914 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1
15915 ; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
15916 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4
15917 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
15918 ; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
15919 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15920 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15921 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm1
15922 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
15923 ; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm1
15924 ; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15925 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm12
15926 ; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm12
15927 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15928 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15929 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15930 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1
15931 ; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm1
15932 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4
15933 ; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
15934 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15935 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm1
15936 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15937 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
15938 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12
15939 ; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
15940 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
15941 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15942 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15943 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
15944 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1
15945 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
15946 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4
15947 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
15948 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
15949 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15950 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
15951 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1
15952 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
15953 ; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
15954 ; AVX512-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
15955 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
15956 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
15957 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15958 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
15959 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
15960 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
15961 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1
15962 ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
15963 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4
15964 ; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
15965 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15966 ; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm8
15967 ; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm10
15968 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
15969 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15970 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15971 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm1
15972 ; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
15973 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4
15974 ; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
15975 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15976 ; AVX512-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
15977 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm16
15978 ; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm9
15979 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm29
15980 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
15981 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
15982 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15983 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
15984 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1
15985 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
15986 ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
15987 ; AVX512-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15988 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4
15989 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm7
15990 ; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
15991 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
15992 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
15993 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1
15994 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm25
15995 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
15996 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
15997 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12
15998 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm15
15999 ; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
16000 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16001 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16002 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16003 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm13
16004 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1
16005 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
16006 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4
16007 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
16008 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16009 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm6
16010 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm1
16011 ; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
16012 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm24
16013 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
16014 ; AVX512-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
16015 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16016 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16017 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16018 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
16019 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16020 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1
16021 ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
16022 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4
16023 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16024 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
16025 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16026 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16027 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm21
16028 ; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
16029 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16030 ; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
16031 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16032 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16033 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16034 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
16035 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1
16036 ; AVX512-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16037 ; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
16038 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4
16039 ; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
16040 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm31
16041 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16042 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16043 ; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
16044 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16045 ; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
16046 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16047 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16048 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16049 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1
16050 ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
16051 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm4
16052 ; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
16053 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm26
16054 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16055 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1
16056 ; AVX512-NEXT: vmovdqa64 %zmm25, %zmm9
16057 ; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
16058 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm12
16059 ; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
16060 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16061 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16062 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16063 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm1
16064 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
16065 ; AVX512-NEXT: vmovdqa64 %zmm18, %zmm25
16066 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4
16067 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16068 ; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
16069 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16070 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm1
16071 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm18
16072 ; AVX512-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
16073 ; AVX512-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
16074 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm24
16075 ; AVX512-NEXT: vmovdqa64 %zmm14, %zmm13
16076 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16077 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16078 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16079 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
16080 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16081 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1
16082 ; AVX512-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16083 ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
16084 ; AVX512-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16085 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm4
16086 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3
16087 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
16088 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16089 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16090 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1
16091 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6
16092 ; AVX512-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16093 ; AVX512-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
16094 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16095 ; AVX512-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16096 ; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
16097 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16098 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16099 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16100 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm1
16101 ; AVX512-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
16102 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4
16103 ; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm4
16104 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16105 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16106 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1
16107 ; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
16108 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16109 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12
16110 ; AVX512-NEXT: vmovdqa64 %zmm29, %zmm11
16111 ; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
16112 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16113 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16114 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16115 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
16116 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1
16117 ; AVX512-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
16118 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm31
16119 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16120 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4
16121 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm19
16122 ; AVX512-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
16123 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16124 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16125 ; AVX512-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
16126 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26
16127 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
16128 ; AVX512-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
16129 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm29
16130 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16131 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16132 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16133 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16134 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
16135 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1
16136 ; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
16137 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4
16138 ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm14
16139 ; AVX512-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
16140 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16141 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
16142 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1
16143 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
16144 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm5
16145 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm13
16146 ; AVX512-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16147 ; AVX512-NEXT: vpermi2d %zmm24, %zmm5, %zmm0
16148 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16149 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16150 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16151 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
16152 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16153 ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm17
16154 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
16155 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
16156 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm1
16157 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm1
16158 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16159 ; AVX512-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
16160 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16161 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1
16162 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16163 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
16164 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm1
16165 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16166 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
16167 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm4
16168 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16169 ; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
16170 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16171 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm1
16172 ; AVX512-NEXT: vmovdqa64 %zmm8, %zmm15
16173 ; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
16174 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm8
16175 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm12
16176 ; AVX512-NEXT: vpermt2d %zmm11, %zmm0, %zmm12
16177 ; AVX512-NEXT: vmovdqa64 %zmm11, %zmm2
16178 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16179 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16180 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16181 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm1
16182 ; AVX512-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
16183 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm16
16184 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm4
16185 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm27
16186 ; AVX512-NEXT: vpermt2d %zmm19, %zmm0, %zmm4
16187 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm20
16188 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16189 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16190 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm1
16191 ; AVX512-NEXT: vpermt2d %zmm26, %zmm0, %zmm1
16192 ; AVX512-NEXT: vmovdqa64 %zmm26, %zmm17
16193 ; AVX512-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
16194 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12
16195 ; AVX512-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
16196 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16197 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16198 ; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16199 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm1
16200 ; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
16201 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm4
16202 ; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm4
16203 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16204 ; AVX512-NEXT: vmovdqa64 %zmm9, %zmm1
16205 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
16206 ; AVX512-NEXT: vpermi2d %zmm13, %zmm5, %zmm0
16207 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16208 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16209 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16210 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
16211 ; AVX512-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16212 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm4
16213 ; AVX512-NEXT: vmovdqa64 %zmm24, %zmm30
16214 ; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm30
16215 ; AVX512-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
16216 ; AVX512-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16217 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm4
16218 ; AVX512-NEXT: vmovdqa64 %zmm4, %zmm19
16219 ; AVX512-NEXT: vmovdqa64 %zmm28, %zmm11
16220 ; AVX512-NEXT: vpermt2d %zmm7, %zmm1, %zmm11
16221 ; AVX512-NEXT: vpermt2d %zmm7, %zmm0, %zmm28
16222 ; AVX512-NEXT: vmovdqa64 %zmm15, %zmm9
16223 ; AVX512-NEXT: vpermt2d %zmm8, %zmm1, %zmm9
16224 ; AVX512-NEXT: vpermt2d %zmm8, %zmm0, %zmm15
16225 ; AVX512-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16226 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16227 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm8
16228 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
16229 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
16230 ; AVX512-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16231 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm31
16232 ; AVX512-NEXT: vpermt2d %zmm16, %zmm1, %zmm31
16233 ; AVX512-NEXT: vpermt2d %zmm16, %zmm0, %zmm23
16234 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm16
16235 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm29
16236 ; AVX512-NEXT: vpermt2d %zmm20, %zmm1, %zmm29
16237 ; AVX512-NEXT: vpermt2d %zmm20, %zmm0, %zmm27
16238 ; AVX512-NEXT: vmovdqa64 %zmm27, %zmm26
16239 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm2
16240 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm5
16241 ; AVX512-NEXT: vpermt2d %zmm17, %zmm1, %zmm5
16242 ; AVX512-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
16243 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16244 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm4
16245 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16246 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm4
16247 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
16248 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm15
16249 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16250 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm23
16251 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16252 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm23
16253 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
16254 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm17
16255 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16256 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm24
16257 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16258 ; AVX512-NEXT: vpermt2d %zmm2, %zmm1, %zmm24
16259 ; AVX512-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
16260 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm27
16261 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16262 ; AVX512-NEXT: vmovdqa64 %zmm12, %zmm13
16263 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16264 ; AVX512-NEXT: vpermt2d %zmm3, %zmm1, %zmm13
16265 ; AVX512-NEXT: vpermt2d %zmm3, %zmm0, %zmm12
16266 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16267 ; AVX512-NEXT: vmovdqa64 %zmm10, %zmm3
16268 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16269 ; AVX512-NEXT: vpermt2d %zmm6, %zmm1, %zmm3
16270 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm10
16271 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm6
16272 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm20
16273 ; AVX512-NEXT: vpermt2d %zmm25, %zmm1, %zmm20
16274 ; AVX512-NEXT: vpermt2d %zmm25, %zmm0, %zmm6
16275 ; AVX512-NEXT: vmovdqa64 %zmm6, %zmm21
16276 ; AVX512-NEXT: vmovdqa64 %zmm22, %zmm25
16277 ; AVX512-NEXT: vpermt2d %zmm14, %zmm1, %zmm25
16278 ; AVX512-NEXT: vpermt2d %zmm14, %zmm0, %zmm22
16279 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16280 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm14
16281 ; AVX512-NEXT: vpermt2d %zmm18, %zmm1, %zmm14
16282 ; AVX512-NEXT: vpermt2d %zmm18, %zmm0, %zmm7
16283 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16284 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16285 ; AVX512-NEXT: vpermi2d %zmm6, %zmm2, %zmm1
16286 ; AVX512-NEXT: vpermt2d %zmm6, %zmm0, %zmm2
16287 ; AVX512-NEXT: vmovdqa64 %zmm2, %zmm6
16288 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
16289 ; AVX512-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
16290 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
16291 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
16292 ; AVX512-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1}
16293 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
16294 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
16295 ; AVX512-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
16296 ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3
16297 ; AVX512-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1}
16298 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
16299 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1
16300 ; AVX512-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1}
16301 ; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
16302 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
16303 ; AVX512-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
16304 ; AVX512-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4
16305 ; AVX512-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1}
16306 ; AVX512-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
16307 ; AVX512-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
16308 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5
16309 ; AVX512-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
16310 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
16311 ; AVX512-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9
16312 ; AVX512-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1}
16313 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
16314 ; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7
16315 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16316 ; AVX512-NEXT: vmovaps %zmm8, 192(%rsi)
16317 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16318 ; AVX512-NEXT: vmovaps %zmm8, 128(%rsi)
16319 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16320 ; AVX512-NEXT: vmovaps %zmm8, 64(%rsi)
16321 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16322 ; AVX512-NEXT: vmovaps %zmm6, (%rsi)
16323 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16324 ; AVX512-NEXT: vmovaps %zmm6, 192(%rdx)
16325 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16326 ; AVX512-NEXT: vmovaps %zmm6, (%rdx)
16327 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16328 ; AVX512-NEXT: vmovaps %zmm6, 64(%rdx)
16329 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16330 ; AVX512-NEXT: vmovaps %zmm6, 128(%rdx)
16331 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16332 ; AVX512-NEXT: vmovaps %zmm6, 192(%rcx)
16333 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16334 ; AVX512-NEXT: vmovaps %zmm6, (%rcx)
16335 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16336 ; AVX512-NEXT: vmovaps %zmm6, 64(%rcx)
16337 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16338 ; AVX512-NEXT: vmovaps %zmm6, 128(%rcx)
16339 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16340 ; AVX512-NEXT: vmovaps %zmm6, 192(%r8)
16341 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16342 ; AVX512-NEXT: vmovaps %zmm6, (%r8)
16343 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16344 ; AVX512-NEXT: vmovaps %zmm6, 64(%r8)
16345 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16346 ; AVX512-NEXT: vmovaps %zmm6, 128(%r8)
16347 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16348 ; AVX512-NEXT: vmovaps %zmm6, 192(%r9)
16349 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16350 ; AVX512-NEXT: vmovaps %zmm6, (%r9)
16351 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16352 ; AVX512-NEXT: vmovaps %zmm6, 64(%r9)
16353 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16354 ; AVX512-NEXT: vmovaps %zmm6, 128(%r9)
16355 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
16356 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16357 ; AVX512-NEXT: vmovaps %zmm6, 192(%rax)
16358 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16359 ; AVX512-NEXT: vmovaps %zmm6, (%rax)
16360 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16361 ; AVX512-NEXT: vmovaps %zmm6, 64(%rax)
16362 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16363 ; AVX512-NEXT: vmovaps %zmm6, 128(%rax)
16364 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
16365 ; AVX512-NEXT: vmovdqa64 %zmm3, 192(%rax)
16366 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
16367 ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax)
16368 ; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax)
16369 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
16370 ; AVX512-NEXT: vmovdqa64 %zmm9, 128(%rax)
16371 ; AVX512-NEXT: vmovdqa64 %zmm5, 192(%rax)
16372 ; AVX512-NEXT: vmovdqa64 %zmm7, (%rax)
16373 ; AVX512-NEXT: vmovdqa64 %zmm4, 64(%rax)
16374 ; AVX512-NEXT: addq $3144, %rsp # imm = 0xC48
16375 ; AVX512-NEXT: vzeroupper
16376 ; AVX512-NEXT: retq
16378 ; AVX512-FCP-LABEL: load_i32_stride8_vf64:
16379 ; AVX512-FCP: # %bb.0:
16380 ; AVX512-FCP-NEXT: subq $3144, %rsp # imm = 0xC48
16381 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11
16382 ; AVX512-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16383 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18
16384 ; AVX512-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31
16385 ; AVX512-FCP-NEXT: vmovaps 1536(%rdi), %zmm0
16386 ; AVX512-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
16387 ; AVX512-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24
16388 ; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16389 ; AVX512-FCP-NEXT: vmovaps 1664(%rdi), %zmm0
16390 ; AVX512-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16391 ; AVX512-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21
16392 ; AVX512-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26
16393 ; AVX512-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22
16394 ; AVX512-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5
16395 ; AVX512-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14
16396 ; AVX512-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3
16397 ; AVX512-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19
16398 ; AVX512-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
16399 ; AVX512-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13
16400 ; AVX512-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27
16401 ; AVX512-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20
16402 ; AVX512-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10
16403 ; AVX512-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17
16404 ; AVX512-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
16405 ; AVX512-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9
16406 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16407 ; AVX512-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12
16408 ; AVX512-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
16409 ; AVX512-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16410 ; AVX512-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
16411 ; AVX512-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23
16412 ; AVX512-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
16413 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
16414 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16415 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
16416 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
16417 ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
16418 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
16419 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
16420 ; AVX512-FCP-NEXT: movb $-64, %al
16421 ; AVX512-FCP-NEXT: kmovw %eax, %k1
16422 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16423 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
16424 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm15
16425 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
16426 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
16427 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
16428 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12
16429 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm25
16430 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16431 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16432 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16433 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
16434 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm6
16435 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16436 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
16437 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
16438 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4
16439 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm29
16440 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16441 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
16442 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16443 ; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
16444 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm30
16445 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16446 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
16447 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12
16448 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
16449 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16450 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16451 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16452 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
16453 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16454 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
16455 ; AVX512-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
16456 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm17
16457 ; AVX512-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16458 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
16459 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
16460 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16461 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16462 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
16463 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
16464 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
16465 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
16466 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
16467 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
16468 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16469 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
16470 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16471 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16472 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16473 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
16474 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
16475 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
16476 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
16477 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4
16478 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16479 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
16480 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16481 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
16482 ; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16483 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1
16484 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
16485 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19
16486 ; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
16487 ; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16488 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16489 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16490 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16491 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
16492 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16493 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
16494 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
16495 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
16496 ; AVX512-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16497 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
16498 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16499 ; AVX512-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16500 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
16501 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm31
16502 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1
16503 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm12
16504 ; AVX512-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16505 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm30
16506 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12
16507 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16508 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16509 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16510 ; AVX512-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16511 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
16512 ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
16513 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
16514 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16515 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
16516 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16517 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16518 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
16519 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
16520 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1
16521 ; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16522 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm12
16523 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12
16524 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16525 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16526 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16527 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
16528 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1
16529 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
16530 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
16531 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16532 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
16533 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16534 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
16535 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
16536 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
16537 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16538 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16539 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16540 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16541 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
16542 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
16543 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
16544 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16545 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
16546 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16547 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
16548 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
16549 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
16550 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
16551 ; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
16552 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16553 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16554 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16555 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
16556 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16557 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
16558 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
16559 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
16560 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
16561 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
16562 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16563 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8
16564 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10
16565 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
16566 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16567 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16568 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
16569 ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
16570 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
16571 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
16572 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16573 ; AVX512-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
16574 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm16
16575 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9
16576 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm29
16577 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
16578 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16579 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16580 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16581 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
16582 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16583 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
16584 ; AVX512-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16585 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
16586 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm7
16587 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
16588 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16589 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
16590 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
16591 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm25
16592 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
16593 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
16594 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
16595 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm15
16596 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
16597 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16598 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16599 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16600 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
16601 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
16602 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
16603 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
16604 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
16605 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16606 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm6
16607 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
16608 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
16609 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm24
16610 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
16611 ; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
16612 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16613 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16614 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16615 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
16616 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16617 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
16618 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
16619 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
16620 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16621 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
16622 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16623 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16624 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm21
16625 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
16626 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16627 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
16628 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16629 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16630 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16631 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
16632 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1
16633 ; AVX512-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16634 ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
16635 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
16636 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
16637 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm31
16638 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16639 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16640 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
16641 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16642 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
16643 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16644 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16645 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16646 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
16647 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
16648 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
16649 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
16650 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm26
16651 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16652 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
16653 ; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm9
16654 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
16655 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
16656 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
16657 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16658 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16659 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16660 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
16661 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
16662 ; AVX512-FCP-NEXT: vmovdqa64 %zmm18, %zmm25
16663 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
16664 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
16665 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
16666 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16667 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
16668 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm18
16669 ; AVX512-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
16670 ; AVX512-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
16671 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm24
16672 ; AVX512-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
16673 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16674 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16675 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16676 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
16677 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16678 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
16679 ; AVX512-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16680 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
16681 ; AVX512-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16682 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
16683 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
16684 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
16685 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16686 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16687 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
16688 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
16689 ; AVX512-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16690 ; AVX512-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
16691 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16692 ; AVX512-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16693 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
16694 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16695 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16696 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16697 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm1
16698 ; AVX512-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
16699 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
16700 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4
16701 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16702 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16703 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
16704 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
16705 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16706 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
16707 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm11
16708 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
16709 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16710 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16711 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16712 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
16713 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
16714 ; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
16715 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm31
16716 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16717 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4
16718 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm19
16719 ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
16720 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16721 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
16722 ; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
16723 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm26
16724 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
16725 ; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
16726 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm29
16727 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16728 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16729 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16730 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16731 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
16732 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
16733 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
16734 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
16735 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
16736 ; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
16737 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16738 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
16739 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
16740 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
16741 ; AVX512-FCP-NEXT: vmovdqa64 %zmm13, %zmm5
16742 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm13
16743 ; AVX512-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16744 ; AVX512-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0
16745 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16746 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16747 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16748 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
16749 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16750 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17
16751 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
16752 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
16753 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
16754 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1
16755 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16756 ; AVX512-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
16757 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16758 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1
16759 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16760 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
16761 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm1
16762 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16763 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
16764 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
16765 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16766 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
16767 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16768 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
16769 ; AVX512-FCP-NEXT: vmovdqa64 %zmm8, %zmm15
16770 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
16771 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm8
16772 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
16773 ; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12
16774 ; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm2
16775 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16776 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16777 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16778 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
16779 ; AVX512-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
16780 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm16
16781 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm4
16782 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm27
16783 ; AVX512-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4
16784 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm20
16785 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16786 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16787 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
16788 ; AVX512-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1
16789 ; AVX512-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
16790 ; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
16791 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
16792 ; AVX512-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
16793 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
16794 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
16795 ; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16796 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
16797 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
16798 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
16799 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4
16800 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
16801 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
16802 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
16803 ; AVX512-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0
16804 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16805 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
16806 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16807 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
16808 ; AVX512-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16809 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm4
16810 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, %zmm30
16811 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30
16812 ; AVX512-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
16813 ; AVX512-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16814 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4
16815 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, %zmm19
16816 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
16817 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11
16818 ; AVX512-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28
16819 ; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
16820 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9
16821 ; AVX512-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15
16822 ; AVX512-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16823 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16824 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm8
16825 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
16826 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
16827 ; AVX512-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16828 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm31
16829 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31
16830 ; AVX512-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23
16831 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm16
16832 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm29
16833 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29
16834 ; AVX512-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27
16835 ; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26
16836 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
16837 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm5
16838 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5
16839 ; AVX512-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
16840 ; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16841 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
16842 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16843 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4
16844 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
16845 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
16846 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16847 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm23
16848 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16849 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23
16850 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
16851 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm17
16852 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16853 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm24
16854 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16855 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24
16856 ; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
16857 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm27
16858 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
16859 ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm13
16860 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
16861 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13
16862 ; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12
16863 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
16864 ; AVX512-FCP-NEXT: vmovdqa64 %zmm10, %zmm3
16865 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16866 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3
16867 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10
16868 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
16869 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm20
16870 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20
16871 ; AVX512-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6
16872 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm21
16873 ; AVX512-FCP-NEXT: vmovdqa64 %zmm22, %zmm25
16874 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25
16875 ; AVX512-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22
16876 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
16877 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
16878 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14
16879 ; AVX512-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7
16880 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16881 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16882 ; AVX512-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1
16883 ; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2
16884 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
16885 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
16886 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
16887 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
16888 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
16889 ; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1}
16890 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
16891 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
16892 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
16893 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3
16894 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1}
16895 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
16896 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1
16897 ; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1}
16898 ; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
16899 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
16900 ; AVX512-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
16901 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4
16902 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1}
16903 ; AVX512-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
16904 ; AVX512-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
16905 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5
16906 ; AVX512-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
16907 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
16908 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9
16909 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1}
16910 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
16911 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7
16912 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16913 ; AVX512-FCP-NEXT: vmovaps %zmm8, 192(%rsi)
16914 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16915 ; AVX512-FCP-NEXT: vmovaps %zmm8, 128(%rsi)
16916 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
16917 ; AVX512-FCP-NEXT: vmovaps %zmm8, 64(%rsi)
16918 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16919 ; AVX512-FCP-NEXT: vmovaps %zmm6, (%rsi)
16920 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16921 ; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%rdx)
16922 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16923 ; AVX512-FCP-NEXT: vmovaps %zmm6, (%rdx)
16924 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16925 ; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%rdx)
16926 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16927 ; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%rdx)
16928 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16929 ; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%rcx)
16930 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16931 ; AVX512-FCP-NEXT: vmovaps %zmm6, (%rcx)
16932 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16933 ; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%rcx)
16934 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16935 ; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%rcx)
16936 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16937 ; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%r8)
16938 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16939 ; AVX512-FCP-NEXT: vmovaps %zmm6, (%r8)
16940 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16941 ; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%r8)
16942 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16943 ; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%r8)
16944 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16945 ; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%r9)
16946 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16947 ; AVX512-FCP-NEXT: vmovaps %zmm6, (%r9)
16948 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16949 ; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%r9)
16950 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16951 ; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%r9)
16952 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16953 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16954 ; AVX512-FCP-NEXT: vmovaps %zmm6, 192(%rax)
16955 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16956 ; AVX512-FCP-NEXT: vmovaps %zmm6, (%rax)
16957 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16958 ; AVX512-FCP-NEXT: vmovaps %zmm6, 64(%rax)
16959 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
16960 ; AVX512-FCP-NEXT: vmovaps %zmm6, 128(%rax)
16961 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16962 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
16963 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
16964 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
16965 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
16966 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16967 ; AVX512-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax)
16968 ; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax)
16969 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
16970 ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
16971 ; AVX512-FCP-NEXT: addq $3144, %rsp # imm = 0xC48
16972 ; AVX512-FCP-NEXT: vzeroupper
16973 ; AVX512-FCP-NEXT: retq
16975 ; AVX512DQ-LABEL: load_i32_stride8_vf64:
16976 ; AVX512DQ: # %bb.0:
16977 ; AVX512DQ-NEXT: subq $3144, %rsp # imm = 0xC48
16978 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %zmm11
16979 ; AVX512DQ-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16980 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %zmm18
16981 ; AVX512DQ-NEXT: vmovdqa64 1600(%rdi), %zmm31
16982 ; AVX512DQ-NEXT: vmovaps 1536(%rdi), %zmm0
16983 ; AVX512DQ-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
16984 ; AVX512DQ-NEXT: vmovdqa64 1728(%rdi), %zmm24
16985 ; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16986 ; AVX512DQ-NEXT: vmovaps 1664(%rdi), %zmm0
16987 ; AVX512DQ-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16988 ; AVX512DQ-NEXT: vmovdqa64 1856(%rdi), %zmm21
16989 ; AVX512DQ-NEXT: vmovdqa64 1792(%rdi), %zmm26
16990 ; AVX512DQ-NEXT: vmovdqa64 1984(%rdi), %zmm22
16991 ; AVX512DQ-NEXT: vmovdqa64 1920(%rdi), %zmm5
16992 ; AVX512DQ-NEXT: vmovdqa64 1088(%rdi), %zmm14
16993 ; AVX512DQ-NEXT: vmovdqa64 1024(%rdi), %zmm3
16994 ; AVX512DQ-NEXT: vmovdqa64 1216(%rdi), %zmm19
16995 ; AVX512DQ-NEXT: vmovdqa64 1152(%rdi), %zmm2
16996 ; AVX512DQ-NEXT: vmovdqa64 1344(%rdi), %zmm13
16997 ; AVX512DQ-NEXT: vmovdqa64 1280(%rdi), %zmm27
16998 ; AVX512DQ-NEXT: vmovdqa64 1472(%rdi), %zmm20
16999 ; AVX512DQ-NEXT: vmovdqa64 1408(%rdi), %zmm10
17000 ; AVX512DQ-NEXT: vmovdqa64 576(%rdi), %zmm17
17001 ; AVX512DQ-NEXT: vmovdqa64 512(%rdi), %zmm7
17002 ; AVX512DQ-NEXT: vmovdqa64 704(%rdi), %zmm9
17003 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17004 ; AVX512DQ-NEXT: vmovdqa64 640(%rdi), %zmm12
17005 ; AVX512DQ-NEXT: vmovdqa64 832(%rdi), %zmm6
17006 ; AVX512DQ-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17007 ; AVX512DQ-NEXT: vmovdqa64 768(%rdi), %zmm28
17008 ; AVX512DQ-NEXT: vmovdqa64 960(%rdi), %zmm23
17009 ; AVX512DQ-NEXT: vmovdqa64 896(%rdi), %zmm4
17010 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
17011 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17012 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm1
17013 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm16
17014 ; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
17015 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4
17016 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
17017 ; AVX512DQ-NEXT: movb $-64, %al
17018 ; AVX512DQ-NEXT: kmovw %eax, %k1
17019 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17020 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm1
17021 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm15
17022 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
17023 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm12
17024 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm9
17025 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm12
17026 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm25
17027 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17028 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17029 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17030 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1
17031 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm6
17032 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17033 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
17034 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4
17035 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm4
17036 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm29
17037 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17038 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm1
17039 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm8
17040 ; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
17041 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm30
17042 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm10
17043 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12
17044 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm12
17045 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13
17046 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17047 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17048 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17049 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14
17050 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17051 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1
17052 ; AVX512DQ-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
17053 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm17
17054 ; AVX512DQ-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17055 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4
17056 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
17057 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17058 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17059 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1
17060 ; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
17061 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
17062 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12
17063 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm24
17064 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
17065 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17066 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm5
17067 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17068 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17069 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17070 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1
17071 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17072 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm22
17073 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4
17074 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm4
17075 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17076 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1
17077 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17078 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %zmm5
17079 ; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17080 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm1
17081 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm7
17082 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %zmm19
17083 ; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
17084 ; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17085 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17086 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17087 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17088 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
17089 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17090 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1
17091 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
17092 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4
17093 ; AVX512DQ-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17094 ; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
17095 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17096 ; AVX512DQ-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17097 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1
17098 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm31
17099 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm1
17100 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm12
17101 ; AVX512DQ-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17102 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm30
17103 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm12
17104 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17105 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17106 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17107 ; AVX512DQ-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17108 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1
17109 ; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
17110 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4
17111 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17112 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
17113 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17114 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17115 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm1
17116 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
17117 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm1
17118 ; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17119 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm12
17120 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm12
17121 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17122 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17123 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17124 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1
17125 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm1
17126 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4
17127 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
17128 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17129 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm1
17130 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17131 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
17132 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12
17133 ; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
17134 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17135 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17136 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17137 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17138 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1
17139 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17140 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4
17141 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17142 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
17143 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17144 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17145 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1
17146 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
17147 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
17148 ; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
17149 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17150 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17151 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17152 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
17153 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17154 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
17155 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1
17156 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
17157 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4
17158 ; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
17159 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17160 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm8
17161 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm10
17162 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
17163 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17164 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17165 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm1
17166 ; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
17167 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4
17168 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
17169 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17170 ; AVX512DQ-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
17171 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm16
17172 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm9
17173 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm29
17174 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
17175 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17176 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17177 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17178 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1
17179 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17180 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
17181 ; AVX512DQ-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17182 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4
17183 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm7
17184 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
17185 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17186 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17187 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1
17188 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm25
17189 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
17190 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
17191 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12
17192 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm15
17193 ; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
17194 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17195 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17196 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17197 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm13
17198 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1
17199 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17200 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4
17201 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
17202 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17203 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm6
17204 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm1
17205 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
17206 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm24
17207 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17208 ; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
17209 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17210 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17211 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17212 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
17213 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17214 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1
17215 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
17216 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4
17217 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17218 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
17219 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17220 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17221 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm21
17222 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
17223 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17224 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
17225 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17226 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17227 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17228 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
17229 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1
17230 ; AVX512DQ-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17231 ; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
17232 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4
17233 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
17234 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm31
17235 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17236 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17237 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
17238 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17239 ; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
17240 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17241 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17242 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17243 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1
17244 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
17245 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm4
17246 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
17247 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm26
17248 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17249 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1
17250 ; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm9
17251 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
17252 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm12
17253 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
17254 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17255 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17256 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17257 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm1
17258 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17259 ; AVX512DQ-NEXT: vmovdqa64 %zmm18, %zmm25
17260 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4
17261 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17262 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
17263 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17264 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm1
17265 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm18
17266 ; AVX512DQ-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
17267 ; AVX512DQ-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
17268 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm24
17269 ; AVX512DQ-NEXT: vmovdqa64 %zmm14, %zmm13
17270 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17271 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17272 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17273 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
17274 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17275 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1
17276 ; AVX512DQ-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17277 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
17278 ; AVX512DQ-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17279 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm4
17280 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3
17281 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
17282 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17283 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17284 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1
17285 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6
17286 ; AVX512DQ-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17287 ; AVX512DQ-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
17288 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17289 ; AVX512DQ-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17290 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
17291 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17292 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17293 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17294 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm1
17295 ; AVX512DQ-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
17296 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4
17297 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm4
17298 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17299 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17300 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1
17301 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
17302 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17303 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12
17304 ; AVX512DQ-NEXT: vmovdqa64 %zmm29, %zmm11
17305 ; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
17306 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17307 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17308 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17309 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
17310 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1
17311 ; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
17312 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm31
17313 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17314 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4
17315 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm19
17316 ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
17317 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17318 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17319 ; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
17320 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26
17321 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
17322 ; AVX512DQ-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
17323 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm29
17324 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17325 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17326 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17327 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17328 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
17329 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1
17330 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
17331 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4
17332 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm14
17333 ; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
17334 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17335 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17336 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1
17337 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17338 ; AVX512DQ-NEXT: vmovdqa64 %zmm13, %zmm5
17339 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm13
17340 ; AVX512DQ-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17341 ; AVX512DQ-NEXT: vpermi2d %zmm24, %zmm5, %zmm0
17342 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17343 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17344 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17345 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
17346 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17347 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm17
17348 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
17349 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
17350 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm1
17351 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm1
17352 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17353 ; AVX512DQ-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
17354 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17355 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1
17356 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17357 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
17358 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm1
17359 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17360 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
17361 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm4
17362 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17363 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
17364 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17365 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm1
17366 ; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm15
17367 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
17368 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm8
17369 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm12
17370 ; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm0, %zmm12
17371 ; AVX512DQ-NEXT: vmovdqa64 %zmm11, %zmm2
17372 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17373 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17374 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17375 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm1
17376 ; AVX512DQ-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
17377 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm16
17378 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm4
17379 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm27
17380 ; AVX512DQ-NEXT: vpermt2d %zmm19, %zmm0, %zmm4
17381 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm20
17382 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17383 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17384 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm1
17385 ; AVX512DQ-NEXT: vpermt2d %zmm26, %zmm0, %zmm1
17386 ; AVX512DQ-NEXT: vmovdqa64 %zmm26, %zmm17
17387 ; AVX512DQ-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
17388 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12
17389 ; AVX512DQ-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
17390 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17391 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17392 ; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17393 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm1
17394 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
17395 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm4
17396 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm4
17397 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17398 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm1
17399 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17400 ; AVX512DQ-NEXT: vpermi2d %zmm13, %zmm5, %zmm0
17401 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17402 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17403 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17404 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
17405 ; AVX512DQ-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17406 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm4
17407 ; AVX512DQ-NEXT: vmovdqa64 %zmm24, %zmm30
17408 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm30
17409 ; AVX512DQ-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
17410 ; AVX512DQ-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17411 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm4
17412 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, %zmm19
17413 ; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm11
17414 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm1, %zmm11
17415 ; AVX512DQ-NEXT: vpermt2d %zmm7, %zmm0, %zmm28
17416 ; AVX512DQ-NEXT: vmovdqa64 %zmm15, %zmm9
17417 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm1, %zmm9
17418 ; AVX512DQ-NEXT: vpermt2d %zmm8, %zmm0, %zmm15
17419 ; AVX512DQ-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17420 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17421 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm8
17422 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
17423 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
17424 ; AVX512DQ-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17425 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm31
17426 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm1, %zmm31
17427 ; AVX512DQ-NEXT: vpermt2d %zmm16, %zmm0, %zmm23
17428 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm16
17429 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm29
17430 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm1, %zmm29
17431 ; AVX512DQ-NEXT: vpermt2d %zmm20, %zmm0, %zmm27
17432 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm26
17433 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm2
17434 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm5
17435 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm1, %zmm5
17436 ; AVX512DQ-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
17437 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17438 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm4
17439 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17440 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm4
17441 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
17442 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm15
17443 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17444 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm23
17445 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17446 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm23
17447 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
17448 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm17
17449 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17450 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm24
17451 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17452 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm1, %zmm24
17453 ; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
17454 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm27
17455 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17456 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, %zmm13
17457 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17458 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm1, %zmm13
17459 ; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm0, %zmm12
17460 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17461 ; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm3
17462 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17463 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm1, %zmm3
17464 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm10
17465 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm6
17466 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm20
17467 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm1, %zmm20
17468 ; AVX512DQ-NEXT: vpermt2d %zmm25, %zmm0, %zmm6
17469 ; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm21
17470 ; AVX512DQ-NEXT: vmovdqa64 %zmm22, %zmm25
17471 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm1, %zmm25
17472 ; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm0, %zmm22
17473 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17474 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm14
17475 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm1, %zmm14
17476 ; AVX512DQ-NEXT: vpermt2d %zmm18, %zmm0, %zmm7
17477 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17478 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17479 ; AVX512DQ-NEXT: vpermi2d %zmm6, %zmm2, %zmm1
17480 ; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm0, %zmm2
17481 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm6
17482 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
17483 ; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
17484 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
17485 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
17486 ; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1}
17487 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
17488 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
17489 ; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
17490 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3
17491 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1}
17492 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
17493 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1
17494 ; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1}
17495 ; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
17496 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
17497 ; AVX512DQ-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
17498 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4
17499 ; AVX512DQ-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1}
17500 ; AVX512DQ-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
17501 ; AVX512DQ-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
17502 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5
17503 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
17504 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
17505 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9
17506 ; AVX512DQ-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1}
17507 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
17508 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7
17509 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17510 ; AVX512DQ-NEXT: vmovaps %zmm8, 192(%rsi)
17511 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17512 ; AVX512DQ-NEXT: vmovaps %zmm8, 128(%rsi)
17513 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17514 ; AVX512DQ-NEXT: vmovaps %zmm8, 64(%rsi)
17515 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17516 ; AVX512DQ-NEXT: vmovaps %zmm6, (%rsi)
17517 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17518 ; AVX512DQ-NEXT: vmovaps %zmm6, 192(%rdx)
17519 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17520 ; AVX512DQ-NEXT: vmovaps %zmm6, (%rdx)
17521 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17522 ; AVX512DQ-NEXT: vmovaps %zmm6, 64(%rdx)
17523 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17524 ; AVX512DQ-NEXT: vmovaps %zmm6, 128(%rdx)
17525 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17526 ; AVX512DQ-NEXT: vmovaps %zmm6, 192(%rcx)
17527 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17528 ; AVX512DQ-NEXT: vmovaps %zmm6, (%rcx)
17529 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17530 ; AVX512DQ-NEXT: vmovaps %zmm6, 64(%rcx)
17531 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17532 ; AVX512DQ-NEXT: vmovaps %zmm6, 128(%rcx)
17533 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17534 ; AVX512DQ-NEXT: vmovaps %zmm6, 192(%r8)
17535 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17536 ; AVX512DQ-NEXT: vmovaps %zmm6, (%r8)
17537 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17538 ; AVX512DQ-NEXT: vmovaps %zmm6, 64(%r8)
17539 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17540 ; AVX512DQ-NEXT: vmovaps %zmm6, 128(%r8)
17541 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17542 ; AVX512DQ-NEXT: vmovaps %zmm6, 192(%r9)
17543 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17544 ; AVX512DQ-NEXT: vmovaps %zmm6, (%r9)
17545 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17546 ; AVX512DQ-NEXT: vmovaps %zmm6, 64(%r9)
17547 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17548 ; AVX512DQ-NEXT: vmovaps %zmm6, 128(%r9)
17549 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
17550 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17551 ; AVX512DQ-NEXT: vmovaps %zmm6, 192(%rax)
17552 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17553 ; AVX512DQ-NEXT: vmovaps %zmm6, (%rax)
17554 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17555 ; AVX512DQ-NEXT: vmovaps %zmm6, 64(%rax)
17556 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17557 ; AVX512DQ-NEXT: vmovaps %zmm6, 128(%rax)
17558 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
17559 ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 192(%rax)
17560 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
17561 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax)
17562 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax)
17563 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
17564 ; AVX512DQ-NEXT: vmovdqa64 %zmm9, 128(%rax)
17565 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 192(%rax)
17566 ; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax)
17567 ; AVX512DQ-NEXT: vmovdqa64 %zmm4, 64(%rax)
17568 ; AVX512DQ-NEXT: addq $3144, %rsp # imm = 0xC48
17569 ; AVX512DQ-NEXT: vzeroupper
17570 ; AVX512DQ-NEXT: retq
17572 ; AVX512DQ-FCP-LABEL: load_i32_stride8_vf64:
17573 ; AVX512DQ-FCP: # %bb.0:
17574 ; AVX512DQ-FCP-NEXT: subq $3144, %rsp # imm = 0xC48
17575 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11
17576 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17577 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18
17578 ; AVX512DQ-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31
17579 ; AVX512DQ-FCP-NEXT: vmovaps 1536(%rdi), %zmm0
17580 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
17581 ; AVX512DQ-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24
17582 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17583 ; AVX512DQ-FCP-NEXT: vmovaps 1664(%rdi), %zmm0
17584 ; AVX512DQ-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17585 ; AVX512DQ-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21
17586 ; AVX512DQ-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26
17587 ; AVX512DQ-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22
17588 ; AVX512DQ-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5
17589 ; AVX512DQ-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14
17590 ; AVX512DQ-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3
17591 ; AVX512DQ-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19
17592 ; AVX512DQ-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
17593 ; AVX512DQ-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13
17594 ; AVX512DQ-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27
17595 ; AVX512DQ-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20
17596 ; AVX512DQ-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10
17597 ; AVX512DQ-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17
17598 ; AVX512DQ-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
17599 ; AVX512DQ-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9
17600 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17601 ; AVX512DQ-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12
17602 ; AVX512DQ-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
17603 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17604 ; AVX512DQ-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
17605 ; AVX512DQ-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23
17606 ; AVX512DQ-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
17607 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
17608 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17609 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
17610 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
17611 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
17612 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
17613 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
17614 ; AVX512DQ-FCP-NEXT: movb $-64, %al
17615 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
17616 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17617 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
17618 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm15
17619 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
17620 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
17621 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
17622 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12
17623 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm25
17624 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17625 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17626 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17627 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
17628 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm6
17629 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17630 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
17631 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
17632 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4
17633 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm29
17634 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17635 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
17636 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17637 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
17638 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm30
17639 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17640 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
17641 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12
17642 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
17643 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17644 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17645 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17646 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
17647 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17648 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
17649 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
17650 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm17
17651 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17652 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
17653 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
17654 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17655 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17656 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
17657 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
17658 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
17659 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
17660 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
17661 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
17662 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17663 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
17664 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17665 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17666 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17667 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
17668 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17669 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
17670 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
17671 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4
17672 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17673 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
17674 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17675 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
17676 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17677 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1
17678 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
17679 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19
17680 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
17681 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17682 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17683 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17684 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17685 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
17686 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17687 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
17688 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
17689 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
17690 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17691 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
17692 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17693 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17694 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
17695 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm31
17696 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1
17697 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm12
17698 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17699 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm30
17700 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12
17701 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17702 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17703 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17704 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17705 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
17706 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
17707 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
17708 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17709 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
17710 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17711 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17712 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
17713 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
17714 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1
17715 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17716 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm12
17717 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12
17718 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17719 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17720 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17721 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
17722 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1
17723 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
17724 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
17725 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17726 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
17727 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17728 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
17729 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
17730 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
17731 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17732 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17733 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17734 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
17735 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
17736 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17737 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
17738 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17739 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
17740 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17741 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17742 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
17743 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
17744 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
17745 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
17746 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17747 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17748 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17749 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
17750 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17751 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
17752 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
17753 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
17754 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
17755 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
17756 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17757 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8
17758 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10
17759 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
17760 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17761 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17762 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
17763 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
17764 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
17765 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
17766 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17767 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
17768 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm16
17769 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9
17770 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm29
17771 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
17772 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17773 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17774 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17775 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
17776 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17777 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
17778 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17779 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
17780 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm7
17781 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
17782 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17783 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17784 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
17785 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm25
17786 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
17787 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
17788 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
17789 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm15
17790 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
17791 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17792 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17793 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17794 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
17795 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
17796 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17797 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
17798 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
17799 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17800 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm6
17801 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
17802 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
17803 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm24
17804 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
17805 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
17806 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17807 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17808 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17809 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
17810 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17811 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
17812 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
17813 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
17814 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17815 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
17816 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17817 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17818 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm21
17819 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
17820 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17821 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
17822 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17823 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17824 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17825 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
17826 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1
17827 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17828 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
17829 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
17830 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
17831 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm31
17832 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17833 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17834 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
17835 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17836 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
17837 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17838 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17839 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17840 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
17841 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
17842 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
17843 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
17844 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm26
17845 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17846 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
17847 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm9
17848 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
17849 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
17850 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
17851 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17852 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17853 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17854 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
17855 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17856 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, %zmm25
17857 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
17858 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
17859 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
17860 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17861 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
17862 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm18
17863 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
17864 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
17865 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm24
17866 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
17867 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17868 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17869 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17870 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
17871 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17872 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
17873 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17874 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
17875 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17876 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
17877 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
17878 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
17879 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17880 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17881 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
17882 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
17883 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17884 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
17885 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17886 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17887 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
17888 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17889 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17890 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17891 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm1
17892 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
17893 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
17894 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4
17895 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17896 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
17897 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
17898 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
17899 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
17900 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
17901 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm11
17902 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
17903 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17904 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17905 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17906 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
17907 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
17908 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
17909 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm31
17910 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17911 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4
17912 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm19
17913 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
17914 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17915 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
17916 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
17917 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm26
17918 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
17919 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
17920 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm29
17921 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17922 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17923 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17924 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17925 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
17926 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
17927 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
17928 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
17929 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
17930 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
17931 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17932 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
17933 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
17934 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17935 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, %zmm5
17936 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm13
17937 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17938 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0
17939 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17940 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
17941 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17942 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
17943 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17944 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17
17945 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
17946 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
17947 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
17948 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1
17949 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
17950 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
17951 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17952 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1
17953 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17954 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
17955 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm1
17956 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
17957 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
17958 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
17959 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
17960 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
17961 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17962 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
17963 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, %zmm15
17964 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
17965 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm8
17966 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
17967 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12
17968 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm2
17969 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17970 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17971 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17972 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
17973 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
17974 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm16
17975 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm4
17976 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm27
17977 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4
17978 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm20
17979 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17980 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
17981 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
17982 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1
17983 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
17984 ; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
17985 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
17986 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
17987 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
17988 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
17989 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
17990 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
17991 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
17992 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
17993 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4
17994 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
17995 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
17996 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
17997 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0
17998 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17999 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18000 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18001 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
18002 ; AVX512DQ-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18003 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm4
18004 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, %zmm30
18005 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30
18006 ; AVX512DQ-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
18007 ; AVX512DQ-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18008 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4
18009 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, %zmm19
18010 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
18011 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11
18012 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28
18013 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
18014 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9
18015 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15
18016 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18017 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18018 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm8
18019 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
18020 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
18021 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18022 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm31
18023 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31
18024 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23
18025 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm16
18026 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm29
18027 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29
18028 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27
18029 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26
18030 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
18031 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm5
18032 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5
18033 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
18034 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18035 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
18036 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18037 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4
18038 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
18039 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
18040 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18041 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm23
18042 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18043 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23
18044 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
18045 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm17
18046 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18047 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm24
18048 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18049 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24
18050 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
18051 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm27
18052 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18053 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm13
18054 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18055 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13
18056 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12
18057 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18058 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, %zmm3
18059 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18060 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3
18061 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10
18062 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
18063 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm20
18064 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20
18065 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6
18066 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm21
18067 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, %zmm25
18068 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25
18069 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22
18070 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18071 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
18072 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14
18073 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7
18074 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18075 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18076 ; AVX512DQ-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1
18077 ; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2
18078 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
18079 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
18080 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
18081 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
18082 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
18083 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1}
18084 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
18085 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
18086 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
18087 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3
18088 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1}
18089 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
18090 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1
18091 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1}
18092 ; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
18093 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
18094 ; AVX512DQ-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
18095 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4
18096 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1}
18097 ; AVX512DQ-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
18098 ; AVX512DQ-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
18099 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5
18100 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
18101 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
18102 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9
18103 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1}
18104 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
18105 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7
18106 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18107 ; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 192(%rsi)
18108 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18109 ; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 128(%rsi)
18110 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18111 ; AVX512DQ-FCP-NEXT: vmovaps %zmm8, 64(%rsi)
18112 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18113 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rsi)
18114 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18115 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%rdx)
18116 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18117 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rdx)
18118 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18119 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%rdx)
18120 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18121 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%rdx)
18122 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18123 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%rcx)
18124 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18125 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rcx)
18126 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18127 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%rcx)
18128 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18129 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%rcx)
18130 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18131 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%r8)
18132 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18133 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%r8)
18134 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18135 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%r8)
18136 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18137 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%r8)
18138 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18139 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%r9)
18140 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18141 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%r9)
18142 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18143 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%r9)
18144 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18145 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%r9)
18146 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
18147 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18148 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 192(%rax)
18149 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18150 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, (%rax)
18151 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18152 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 64(%rax)
18153 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18154 ; AVX512DQ-FCP-NEXT: vmovaps %zmm6, 128(%rax)
18155 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
18156 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
18157 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
18158 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
18159 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
18160 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
18161 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax)
18162 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax)
18163 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
18164 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
18165 ; AVX512DQ-FCP-NEXT: addq $3144, %rsp # imm = 0xC48
18166 ; AVX512DQ-FCP-NEXT: vzeroupper
18167 ; AVX512DQ-FCP-NEXT: retq
18168 ;
18169 ; AVX512BW-LABEL: load_i32_stride8_vf64:
18170 ; AVX512BW: # %bb.0:
18171 ; AVX512BW-NEXT: subq $3144, %rsp # imm = 0xC48
18172 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm11
18173 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18174 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm18
18175 ; AVX512BW-NEXT: vmovdqa64 1600(%rdi), %zmm31
18176 ; AVX512BW-NEXT: vmovaps 1536(%rdi), %zmm0
18177 ; AVX512BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
18178 ; AVX512BW-NEXT: vmovdqa64 1728(%rdi), %zmm24
18179 ; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18180 ; AVX512BW-NEXT: vmovaps 1664(%rdi), %zmm0
18181 ; AVX512BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18182 ; AVX512BW-NEXT: vmovdqa64 1856(%rdi), %zmm21
18183 ; AVX512BW-NEXT: vmovdqa64 1792(%rdi), %zmm26
18184 ; AVX512BW-NEXT: vmovdqa64 1984(%rdi), %zmm22
18185 ; AVX512BW-NEXT: vmovdqa64 1920(%rdi), %zmm5
18186 ; AVX512BW-NEXT: vmovdqa64 1088(%rdi), %zmm14
18187 ; AVX512BW-NEXT: vmovdqa64 1024(%rdi), %zmm3
18188 ; AVX512BW-NEXT: vmovdqa64 1216(%rdi), %zmm19
18189 ; AVX512BW-NEXT: vmovdqa64 1152(%rdi), %zmm2
18190 ; AVX512BW-NEXT: vmovdqa64 1344(%rdi), %zmm13
18191 ; AVX512BW-NEXT: vmovdqa64 1280(%rdi), %zmm27
18192 ; AVX512BW-NEXT: vmovdqa64 1472(%rdi), %zmm20
18193 ; AVX512BW-NEXT: vmovdqa64 1408(%rdi), %zmm10
18194 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm17
18195 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm7
18196 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm9
18197 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18198 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm12
18199 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm6
18200 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18201 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm28
18202 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm23
18203 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm4
18204 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
18205 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18206 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
18207 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16
18208 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
18209 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4
18210 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
18211 ; AVX512BW-NEXT: movb $-64, %al
18212 ; AVX512BW-NEXT: kmovd %eax, %k1
18213 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18214 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm1
18215 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15
18216 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
18217 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12
18218 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9
18219 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm12
18220 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25
18221 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18222 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18223 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18224 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1
18225 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm6
18226 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18227 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
18228 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4
18229 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm4
18230 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29
18231 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18232 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
18233 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
18234 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
18235 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm30
18236 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
18237 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12
18238 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm12
18239 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13
18240 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18241 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18242 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18243 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14
18244 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18245 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
18246 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
18247 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm17
18248 ; AVX512BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18249 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4
18250 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
18251 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18252 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18253 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1
18254 ; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
18255 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
18256 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12
18257 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm24
18258 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
18259 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18260 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm5
18261 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18262 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18263 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18264 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
18265 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
18266 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm22
18267 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4
18268 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm4
18269 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18270 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
18271 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18272 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5
18273 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18274 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm1
18275 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm7
18276 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm19
18277 ; AVX512BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
18278 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18279 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18280 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18281 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18282 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
18283 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18284 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1
18285 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
18286 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4
18287 ; AVX512BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18288 ; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
18289 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18290 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18291 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
18292 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm31
18293 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1
18294 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm12
18295 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18296 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm30
18297 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12
18298 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18299 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18300 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18301 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18302 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1
18303 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
18304 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4
18305 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18306 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
18307 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18308 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18309 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm1
18310 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18311 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1
18312 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18313 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm12
18314 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12
18315 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18316 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18317 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18318 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1
18319 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1
18320 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4
18321 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
18322 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18323 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1
18324 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18325 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
18326 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12
18327 ; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
18328 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18329 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18330 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18331 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18332 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1
18333 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
18334 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4
18335 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18336 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
18337 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18338 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18339 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1
18340 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
18341 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
18342 ; AVX512BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
18343 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18344 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18345 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18346 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
18347 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18348 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
18349 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1
18350 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
18351 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4
18352 ; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
18353 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18354 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8
18355 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm10
18356 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
18357 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18358 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18359 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm1
18360 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
18361 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4
18362 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
18363 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18364 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
18365 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16
18366 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm9
18367 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm29
18368 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
18369 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18370 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18371 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18372 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
18373 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18374 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
18375 ; AVX512BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18376 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4
18377 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm7
18378 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
18379 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18380 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18381 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1
18382 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm25
18383 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
18384 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
18385 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12
18386 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm15
18387 ; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
18388 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18389 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18390 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18391 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13
18392 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1
18393 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
18394 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4
18395 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
18396 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18397 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm6
18398 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm1
18399 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
18400 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm24
18401 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18402 ; AVX512BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
18403 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18404 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18405 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18406 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
18407 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18408 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1
18409 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
18410 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4
18411 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18412 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
18413 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18414 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18415 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm21
18416 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
18417 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18418 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
18419 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18420 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18421 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18422 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
18423 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1
18424 ; AVX512BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18425 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
18426 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4
18427 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
18428 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm31
18429 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18430 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18431 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
18432 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18433 ; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
18434 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18435 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18436 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18437 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
18438 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
18439 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4
18440 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
18441 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm26
18442 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18443 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1
18444 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm9
18445 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
18446 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12
18447 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
18448 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18449 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18450 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18451 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm1
18452 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
18453 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm25
18454 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4
18455 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18456 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
18457 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18458 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm1
18459 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm18
18460 ; AVX512BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
18461 ; AVX512BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
18462 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm24
18463 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13
18464 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18465 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18466 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18467 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
18468 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18469 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm1
18470 ; AVX512BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18471 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
18472 ; AVX512BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18473 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm4
18474 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm3
18475 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
18476 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18477 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18478 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1
18479 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6
18480 ; AVX512BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18481 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
18482 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18483 ; AVX512BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18484 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
18485 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18486 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18487 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18488 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm1
18489 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
18490 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4
18491 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4
18492 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18493 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18494 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
18495 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
18496 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18497 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12
18498 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm11
18499 ; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
18500 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18501 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18502 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18503 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
18504 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1
18505 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
18506 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm31
18507 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18508 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4
18509 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm19
18510 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
18511 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18512 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
18513 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
18514 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm26
18515 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
18516 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
18517 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29
18518 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18519 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18520 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18521 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18522 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
18523 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1
18524 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
18525 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4
18526 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14
18527 ; AVX512BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
18528 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18529 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18530 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1
18531 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
18532 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5
18533 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm13
18534 ; AVX512BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18535 ; AVX512BW-NEXT: vpermi2d %zmm24, %zmm5, %zmm0
18536 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18537 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18538 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18539 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
18540 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18541 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm17
18542 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
18543 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
18544 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm1
18545 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1
18546 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18547 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
18548 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18549 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1
18550 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18551 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
18552 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm1
18553 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18554 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
18555 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm4
18556 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18557 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
18558 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18559 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm1
18560 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm15
18561 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
18562 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm8
18563 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12
18564 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12
18565 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm2
18566 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18567 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18568 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18569 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm1
18570 ; AVX512BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
18571 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm16
18572 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm4
18573 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm27
18574 ; AVX512BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4
18575 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm20
18576 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18577 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18578 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm1
18579 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm1
18580 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17
18581 ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
18582 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12
18583 ; AVX512BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
18584 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18585 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18586 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18587 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm1
18588 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
18589 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm4
18590 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4
18591 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18592 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm1
18593 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
18594 ; AVX512BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm0
18595 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18596 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18597 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18598 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
18599 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18600 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm4
18601 ; AVX512BW-NEXT: vmovdqa64 %zmm24, %zmm30
18602 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm30
18603 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
18604 ; AVX512BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18605 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4
18606 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm19
18607 ; AVX512BW-NEXT: vmovdqa64 %zmm28, %zmm11
18608 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm11
18609 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm28
18610 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm9
18611 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm9
18612 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm15
18613 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18614 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18615 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm8
18616 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
18617 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
18618 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18619 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm31
18620 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm31
18621 ; AVX512BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm23
18622 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm16
18623 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm29
18624 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm29
18625 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm27
18626 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26
18627 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm2
18628 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm5
18629 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm5
18630 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
18631 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18632 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4
18633 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18634 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm4
18635 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
18636 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15
18637 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18638 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm23
18639 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18640 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23
18641 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
18642 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17
18643 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18644 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm24
18645 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18646 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24
18647 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
18648 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm27
18649 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
18650 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13
18651 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18652 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm13
18653 ; AVX512BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm12
18654 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18655 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3
18656 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18657 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm3
18658 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm10
18659 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm6
18660 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm20
18661 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm20
18662 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm6
18663 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm21
18664 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm25
18665 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm25
18666 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm22
18667 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
18668 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm14
18669 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm14
18670 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7
18671 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18672 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18673 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm2, %zmm1
18674 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2
18675 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6
18676 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
18677 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
18678 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
18679 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
18680 ; AVX512BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1}
18681 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
18682 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
18683 ; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
18684 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3
18685 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1}
18686 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
18687 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1
18688 ; AVX512BW-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1}
18689 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
18690 ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
18691 ; AVX512BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
18692 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4
18693 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1}
18694 ; AVX512BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
18695 ; AVX512BW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
18696 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5
18697 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
18698 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
18699 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9
18700 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1}
18701 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
18702 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7
18703 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18704 ; AVX512BW-NEXT: vmovaps %zmm8, 192(%rsi)
18705 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18706 ; AVX512BW-NEXT: vmovaps %zmm8, 128(%rsi)
18707 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18708 ; AVX512BW-NEXT: vmovaps %zmm8, 64(%rsi)
18709 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18710 ; AVX512BW-NEXT: vmovaps %zmm6, (%rsi)
18711 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18712 ; AVX512BW-NEXT: vmovaps %zmm6, 192(%rdx)
18713 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18714 ; AVX512BW-NEXT: vmovaps %zmm6, (%rdx)
18715 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18716 ; AVX512BW-NEXT: vmovaps %zmm6, 64(%rdx)
18717 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18718 ; AVX512BW-NEXT: vmovaps %zmm6, 128(%rdx)
18719 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18720 ; AVX512BW-NEXT: vmovaps %zmm6, 192(%rcx)
18721 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18722 ; AVX512BW-NEXT: vmovaps %zmm6, (%rcx)
18723 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18724 ; AVX512BW-NEXT: vmovaps %zmm6, 64(%rcx)
18725 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18726 ; AVX512BW-NEXT: vmovaps %zmm6, 128(%rcx)
18727 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18728 ; AVX512BW-NEXT: vmovaps %zmm6, 192(%r8)
18729 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18730 ; AVX512BW-NEXT: vmovaps %zmm6, (%r8)
18731 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18732 ; AVX512BW-NEXT: vmovaps %zmm6, 64(%r8)
18733 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18734 ; AVX512BW-NEXT: vmovaps %zmm6, 128(%r8)
18735 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18736 ; AVX512BW-NEXT: vmovaps %zmm6, 192(%r9)
18737 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18738 ; AVX512BW-NEXT: vmovaps %zmm6, (%r9)
18739 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18740 ; AVX512BW-NEXT: vmovaps %zmm6, 64(%r9)
18741 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18742 ; AVX512BW-NEXT: vmovaps %zmm6, 128(%r9)
18743 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
18744 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18745 ; AVX512BW-NEXT: vmovaps %zmm6, 192(%rax)
18746 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18747 ; AVX512BW-NEXT: vmovaps %zmm6, (%rax)
18748 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18749 ; AVX512BW-NEXT: vmovaps %zmm6, 64(%rax)
18750 ; AVX512BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18751 ; AVX512BW-NEXT: vmovaps %zmm6, 128(%rax)
18752 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
18753 ; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax)
18754 ; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rax)
18755 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
18756 ; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax)
18757 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
18758 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 128(%rax)
18759 ; AVX512BW-NEXT: vmovdqa64 %zmm5, 192(%rax)
18760 ; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax)
18761 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%rax)
18762 ; AVX512BW-NEXT: addq $3144, %rsp # imm = 0xC48
18763 ; AVX512BW-NEXT: vzeroupper
18764 ; AVX512BW-NEXT: retq
18765 ;
18766 ; AVX512BW-FCP-LABEL: load_i32_stride8_vf64:
18767 ; AVX512BW-FCP: # %bb.0:
18768 ; AVX512BW-FCP-NEXT: subq $3144, %rsp # imm = 0xC48
18769 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11
18770 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18771 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18
18772 ; AVX512BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31
18773 ; AVX512BW-FCP-NEXT: vmovaps 1536(%rdi), %zmm0
18774 ; AVX512BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
18775 ; AVX512BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24
18776 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18777 ; AVX512BW-FCP-NEXT: vmovaps 1664(%rdi), %zmm0
18778 ; AVX512BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18779 ; AVX512BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21
18780 ; AVX512BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26
18781 ; AVX512BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22
18782 ; AVX512BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5
18783 ; AVX512BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14
18784 ; AVX512BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3
18785 ; AVX512BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19
18786 ; AVX512BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
18787 ; AVX512BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13
18788 ; AVX512BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27
18789 ; AVX512BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20
18790 ; AVX512BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10
18791 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17
18792 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
18793 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9
18794 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18795 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12
18796 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
18797 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18798 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
18799 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23
18800 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
18801 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
18802 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18803 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
18804 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
18805 ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
18806 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
18807 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
18808 ; AVX512BW-FCP-NEXT: movb $-64, %al
18809 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1
18810 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18811 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
18812 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15
18813 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
18814 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
18815 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
18816 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12
18817 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25
18818 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18819 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18820 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18821 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
18822 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6
18823 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18824 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
18825 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
18826 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4
18827 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29
18828 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18829 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
18830 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
18831 ; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
18832 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm30
18833 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
18834 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
18835 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12
18836 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
18837 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18838 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18839 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18840 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
18841 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18842 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
18843 ; AVX512BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
18844 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17
18845 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18846 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
18847 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
18848 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18849 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18850 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
18851 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
18852 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
18853 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
18854 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
18855 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
18856 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18857 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
18858 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18859 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18860 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18861 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
18862 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
18863 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
18864 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
18865 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4
18866 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18867 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
18868 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18869 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
18870 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18871 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1
18872 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
18873 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19
18874 ; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
18875 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18876 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18877 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18878 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18879 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
18880 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18881 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
18882 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
18883 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
18884 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18885 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
18886 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18887 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18888 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
18889 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm31
18890 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1
18891 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12
18892 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18893 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30
18894 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12
18895 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18896 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18897 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18898 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18899 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
18900 ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
18901 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
18902 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
18903 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
18904 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18905 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18906 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
18907 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
18908 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1
18909 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18910 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12
18911 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12
18912 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18913 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18914 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18915 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
18916 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1
18917 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
18918 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
18919 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18920 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
18921 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
18922 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
18923 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
18924 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
18925 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18926 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18927 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18928 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
18929 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
18930 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
18931 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
18932 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
18933 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
18934 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18935 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18936 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
18937 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
18938 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
18939 ; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
18940 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
18941 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
18942 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18943 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
18944 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
18945 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
18946 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
18947 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
18948 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
18949 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
18950 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18951 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8
18952 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10
18953 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
18954 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18955 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18956 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
18957 ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
18958 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
18959 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
18960 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18961 ; AVX512BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
18962 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16
18963 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9
18964 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29
18965 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
18966 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18967 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18968 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
18969 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
18970 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
18971 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
18972 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18973 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
18974 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm7
18975 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
18976 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18977 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
18978 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
18979 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25
18980 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
18981 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
18982 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
18983 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm15
18984 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
18985 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
18986 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
18987 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
18988 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
18989 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
18990 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
18991 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
18992 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
18993 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
18994 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6
18995 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
18996 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
18997 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24
18998 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
18999 ; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
19000 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19001 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19002 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19003 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
19004 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19005 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
19006 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
19007 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
19008 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19009 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
19010 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19011 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19012 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm21
19013 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
19014 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19015 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
19016 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19017 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19018 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19019 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19020 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1
19021 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19022 ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
19023 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
19024 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
19025 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm31
19026 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19027 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19028 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
19029 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19030 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
19031 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19032 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19033 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19034 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
19035 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
19036 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
19037 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
19038 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26
19039 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19040 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
19041 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9
19042 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
19043 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
19044 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
19045 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19046 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19047 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19048 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
19049 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
19050 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25
19051 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
19052 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19053 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
19054 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19055 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
19056 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18
19057 ; AVX512BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
19058 ; AVX512BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
19059 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm24
19060 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
19061 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19062 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19063 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19064 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
19065 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19066 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
19067 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19068 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
19069 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19070 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
19071 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
19072 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
19073 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19074 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19075 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
19076 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
19077 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19078 ; AVX512BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
19079 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19080 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19081 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
19082 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19083 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19084 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19085 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1
19086 ; AVX512BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
19087 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
19088 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4
19089 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19090 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19091 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
19092 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
19093 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19094 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
19095 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm11
19096 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
19097 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19098 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19099 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19100 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
19101 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
19102 ; AVX512BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
19103 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31
19104 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19105 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4
19106 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19
19107 ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
19108 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19109 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19110 ; AVX512BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
19111 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26
19112 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
19113 ; AVX512BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
19114 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29
19115 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19116 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19117 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19118 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19119 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
19120 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
19121 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
19122 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
19123 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
19124 ; AVX512BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
19125 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19126 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19127 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
19128 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
19129 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5
19130 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13
19131 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19132 ; AVX512BW-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0
19133 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19134 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19135 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19136 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
19137 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19138 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17
19139 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
19140 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
19141 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
19142 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1
19143 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19144 ; AVX512BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
19145 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19146 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1
19147 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19148 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
19149 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1
19150 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19151 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
19152 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
19153 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19154 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
19155 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19156 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
19157 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15
19158 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
19159 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8
19160 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
19161 ; AVX512BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12
19162 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2
19163 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19164 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19165 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19166 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
19167 ; AVX512BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
19168 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16
19169 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4
19170 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm27
19171 ; AVX512BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4
19172 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20
19173 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19174 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19175 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
19176 ; AVX512BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1
19177 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
19178 ; AVX512BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
19179 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
19180 ; AVX512BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
19181 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19182 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19183 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19184 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
19185 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
19186 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
19187 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4
19188 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19189 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
19190 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
19191 ; AVX512BW-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0
19192 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19193 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19194 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19195 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
19196 ; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19197 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4
19198 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30
19199 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30
19200 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
19201 ; AVX512BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19202 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4
19203 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19
19204 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
19205 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11
19206 ; AVX512BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28
19207 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
19208 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9
19209 ; AVX512BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15
19210 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19211 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19212 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8
19213 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
19214 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
19215 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19216 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31
19217 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31
19218 ; AVX512BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23
19219 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16
19220 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm29
19221 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29
19222 ; AVX512BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27
19223 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26
19224 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
19225 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5
19226 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5
19227 ; AVX512BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
19228 ; AVX512BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19229 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
19230 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19231 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4
19232 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
19233 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
19234 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19235 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23
19236 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19237 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23
19238 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
19239 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17
19240 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19241 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24
19242 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19243 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24
19244 ; AVX512BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
19245 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27
19246 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19247 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13
19248 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19249 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13
19250 ; AVX512BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12
19251 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19252 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3
19253 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19254 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3
19255 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10
19256 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
19257 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20
19258 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20
19259 ; AVX512BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6
19260 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21
19261 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25
19262 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25
19263 ; AVX512BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22
19264 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19265 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
19266 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14
19267 ; AVX512BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7
19268 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19269 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19270 ; AVX512BW-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1
19271 ; AVX512BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2
19272 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
19273 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
19274 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
19275 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
19276 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
19277 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1}
19278 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
19279 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
19280 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
19281 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3
19282 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1}
19283 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
19284 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1
19285 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1}
19286 ; AVX512BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19287 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
19288 ; AVX512BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
19289 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4
19290 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1}
19291 ; AVX512BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
19292 ; AVX512BW-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
19293 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5
19294 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
19295 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
19296 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9
19297 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1}
19298 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
19299 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7
19300 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19301 ; AVX512BW-FCP-NEXT: vmovaps %zmm8, 192(%rsi)
19302 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19303 ; AVX512BW-FCP-NEXT: vmovaps %zmm8, 128(%rsi)
19304 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19305 ; AVX512BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi)
19306 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19307 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rsi)
19308 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19309 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%rdx)
19310 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19311 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rdx)
19312 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19313 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%rdx)
19314 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19315 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%rdx)
19316 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19317 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%rcx)
19318 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19319 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rcx)
19320 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19321 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%rcx)
19322 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19323 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%rcx)
19324 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19325 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%r8)
19326 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19327 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%r8)
19328 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19329 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%r8)
19330 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19331 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%r8)
19332 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19333 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%r9)
19334 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19335 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%r9)
19336 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19337 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%r9)
19338 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19339 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%r9)
19340 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
19341 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19342 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 192(%rax)
19343 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19344 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, (%rax)
19345 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19346 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 64(%rax)
19347 ; AVX512BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19348 ; AVX512BW-FCP-NEXT: vmovaps %zmm6, 128(%rax)
19349 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
19350 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
19351 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
19352 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
19353 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
19354 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
19355 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax)
19356 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax)
19357 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
19358 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
19359 ; AVX512BW-FCP-NEXT: addq $3144, %rsp # imm = 0xC48
19360 ; AVX512BW-FCP-NEXT: vzeroupper
19361 ; AVX512BW-FCP-NEXT: retq
19362 ;
19363 ; AVX512DQ-BW-LABEL: load_i32_stride8_vf64:
19364 ; AVX512DQ-BW: # %bb.0:
19365 ; AVX512DQ-BW-NEXT: subq $3144, %rsp # imm = 0xC48
19366 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm11
19367 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19368 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm18
19369 ; AVX512DQ-BW-NEXT: vmovdqa64 1600(%rdi), %zmm31
19370 ; AVX512DQ-BW-NEXT: vmovaps 1536(%rdi), %zmm0
19371 ; AVX512DQ-BW-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
19372 ; AVX512DQ-BW-NEXT: vmovdqa64 1728(%rdi), %zmm24
19373 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19374 ; AVX512DQ-BW-NEXT: vmovaps 1664(%rdi), %zmm0
19375 ; AVX512DQ-BW-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19376 ; AVX512DQ-BW-NEXT: vmovdqa64 1856(%rdi), %zmm21
19377 ; AVX512DQ-BW-NEXT: vmovdqa64 1792(%rdi), %zmm26
19378 ; AVX512DQ-BW-NEXT: vmovdqa64 1984(%rdi), %zmm22
19379 ; AVX512DQ-BW-NEXT: vmovdqa64 1920(%rdi), %zmm5
19380 ; AVX512DQ-BW-NEXT: vmovdqa64 1088(%rdi), %zmm14
19381 ; AVX512DQ-BW-NEXT: vmovdqa64 1024(%rdi), %zmm3
19382 ; AVX512DQ-BW-NEXT: vmovdqa64 1216(%rdi), %zmm19
19383 ; AVX512DQ-BW-NEXT: vmovdqa64 1152(%rdi), %zmm2
19384 ; AVX512DQ-BW-NEXT: vmovdqa64 1344(%rdi), %zmm13
19385 ; AVX512DQ-BW-NEXT: vmovdqa64 1280(%rdi), %zmm27
19386 ; AVX512DQ-BW-NEXT: vmovdqa64 1472(%rdi), %zmm20
19387 ; AVX512DQ-BW-NEXT: vmovdqa64 1408(%rdi), %zmm10
19388 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm17
19389 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm7
19390 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm9
19391 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19392 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm12
19393 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm6
19394 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19395 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm28
19396 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm23
19397 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm4
19398 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
19399 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19400 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm1
19401 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm16
19402 ; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
19403 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4
19404 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
19405 ; AVX512DQ-BW-NEXT: movb $-64, %al
19406 ; AVX512DQ-BW-NEXT: kmovd %eax, %k1
19407 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19408 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm1
19409 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm15
19410 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
19411 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm12
19412 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9
19413 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm12
19414 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm25
19415 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19416 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19417 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19418 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1
19419 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm6
19420 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19421 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
19422 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4
19423 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm4
19424 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm29
19425 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19426 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm1
19427 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
19428 ; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
19429 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm30
19430 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
19431 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12
19432 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm12
19433 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13
19434 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19435 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19436 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19437 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14
19438 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19439 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1
19440 ; AVX512DQ-BW-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
19441 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm17
19442 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19443 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4
19444 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
19445 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19446 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19447 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1
19448 ; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
19449 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
19450 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12
19451 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm24
19452 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
19453 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19454 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm5
19455 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19456 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19457 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19458 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm1
19459 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
19460 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm22
19461 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4
19462 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm4
19463 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19464 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
19465 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19466 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm5
19467 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19468 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm1
19469 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm7
19470 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm19
19471 ; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
19472 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19473 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19474 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19475 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19476 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
19477 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19478 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1
19479 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
19480 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4
19481 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19482 ; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
19483 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19484 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19485 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
19486 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm31
19487 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm1
19488 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm12
19489 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19490 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm30
19491 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm12
19492 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19493 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19494 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19495 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19496 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1
19497 ; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
19498 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4
19499 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19500 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
19501 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19502 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19503 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm1
19504 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
19505 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm1
19506 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19507 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm12
19508 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm12
19509 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19510 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19511 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19512 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1
19513 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm1
19514 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4
19515 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
19516 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19517 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm1
19518 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19519 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
19520 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12
19521 ; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
19522 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19523 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19524 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19525 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19526 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1
19527 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
19528 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4
19529 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19530 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
19531 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19532 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19533 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1
19534 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19535 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
19536 ; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
19537 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19538 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19539 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19540 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
19541 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19542 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
19543 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1
19544 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
19545 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4
19546 ; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
19547 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19548 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm8
19549 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm10
19550 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
19551 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19552 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19553 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm1
19554 ; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
19555 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4
19556 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
19557 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19558 ; AVX512DQ-BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
19559 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm16
19560 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm9
19561 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm29
19562 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
19563 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19564 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19565 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19566 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
19567 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19568 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
19569 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19570 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4
19571 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm7
19572 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
19573 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19574 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19575 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1
19576 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm25
19577 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
19578 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
19579 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12
19580 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm15
19581 ; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
19582 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19583 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19584 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19585 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13
19586 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1
19587 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
19588 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4
19589 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
19590 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19591 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm6
19592 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm1
19593 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
19594 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm24
19595 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
19596 ; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
19597 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19598 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19599 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19600 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
19601 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19602 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1
19603 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
19604 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4
19605 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19606 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
19607 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19608 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19609 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm21
19610 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
19611 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19612 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
19613 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19614 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19615 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19616 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
19617 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1
19618 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19619 ; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
19620 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4
19621 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
19622 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm31
19623 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19624 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19625 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
19626 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19627 ; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
19628 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19629 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19630 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19631 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
19632 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
19633 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm4
19634 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
19635 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm26
19636 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19637 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1
19638 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, %zmm9
19639 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
19640 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12
19641 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
19642 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19643 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19644 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19645 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm1
19646 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
19647 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, %zmm25
19648 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4
19649 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
19650 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
19651 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19652 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm1
19653 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm18
19654 ; AVX512DQ-BW-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
19655 ; AVX512DQ-BW-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
19656 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm24
19657 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm13
19658 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19659 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19660 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19661 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
19662 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19663 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm1
19664 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19665 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
19666 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19667 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm4
19668 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm3
19669 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
19670 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19671 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19672 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1
19673 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6
19674 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19675 ; AVX512DQ-BW-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
19676 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19677 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19678 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
19679 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19680 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19681 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19682 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm1
19683 ; AVX512DQ-BW-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
19684 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4
19685 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm4
19686 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19687 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19688 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
19689 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
19690 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19691 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12
19692 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm29, %zmm11
19693 ; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
19694 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19695 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19696 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19697 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
19698 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1
19699 ; AVX512DQ-BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
19700 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm31
19701 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19702 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4
19703 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm19
19704 ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
19705 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19706 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
19707 ; AVX512DQ-BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
19708 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm26
19709 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
19710 ; AVX512DQ-BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
19711 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm29
19712 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19713 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19714 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19715 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19716 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
19717 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1
19718 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
19719 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4
19720 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14
19721 ; AVX512DQ-BW-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
19722 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19723 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
19724 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1
19725 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
19726 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm5
19727 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm13
19728 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19729 ; AVX512DQ-BW-NEXT: vpermi2d %zmm24, %zmm5, %zmm0
19730 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19731 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19732 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19733 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
19734 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19735 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm17
19736 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
19737 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
19738 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm1
19739 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm1
19740 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19741 ; AVX512DQ-BW-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
19742 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19743 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1
19744 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19745 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
19746 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm1
19747 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19748 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
19749 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm4
19750 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19751 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
19752 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19753 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm1
19754 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm15
19755 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
19756 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm8
19757 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12
19758 ; AVX512DQ-BW-NEXT: vpermt2d %zmm11, %zmm0, %zmm12
19759 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm2
19760 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19761 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19762 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19763 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm1
19764 ; AVX512DQ-BW-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
19765 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm16
19766 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm4
19767 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm27
19768 ; AVX512DQ-BW-NEXT: vpermt2d %zmm19, %zmm0, %zmm4
19769 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm20
19770 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19771 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19772 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm1
19773 ; AVX512DQ-BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm1
19774 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, %zmm17
19775 ; AVX512DQ-BW-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
19776 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12
19777 ; AVX512DQ-BW-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
19778 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
19779 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
19780 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19781 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm1
19782 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
19783 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm4
19784 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm4
19785 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
19786 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm1
19787 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
19788 ; AVX512DQ-BW-NEXT: vpermi2d %zmm13, %zmm5, %zmm0
19789 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
19790 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
19791 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19792 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
19793 ; AVX512DQ-BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19794 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm4
19795 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, %zmm30
19796 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm30
19797 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
19798 ; AVX512DQ-BW-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19799 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm4
19800 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm19
19801 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, %zmm11
19802 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm1, %zmm11
19803 ; AVX512DQ-BW-NEXT: vpermt2d %zmm7, %zmm0, %zmm28
19804 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm9
19805 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm1, %zmm9
19806 ; AVX512DQ-BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm15
19807 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19808 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19809 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm8
19810 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
19811 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
19812 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19813 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm31
19814 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm1, %zmm31
19815 ; AVX512DQ-BW-NEXT: vpermt2d %zmm16, %zmm0, %zmm23
19816 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm16
19817 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm29
19818 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm1, %zmm29
19819 ; AVX512DQ-BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm27
19820 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, %zmm26
19821 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm2
19822 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm5
19823 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm1, %zmm5
19824 ; AVX512DQ-BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
19825 ; AVX512DQ-BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19826 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4
19827 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19828 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm4
19829 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
19830 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15
19831 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19832 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm23
19833 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19834 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm23
19835 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
19836 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm17
19837 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19838 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm24
19839 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19840 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm24
19841 ; AVX512DQ-BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
19842 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm27
19843 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
19844 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13
19845 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
19846 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm1, %zmm13
19847 ; AVX512DQ-BW-NEXT: vpermt2d %zmm3, %zmm0, %zmm12
19848 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
19849 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm3
19850 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19851 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm3
19852 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm10
19853 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm6
19854 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm20
19855 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm1, %zmm20
19856 ; AVX512DQ-BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm6
19857 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm21
19858 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, %zmm25
19859 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm1, %zmm25
19860 ; AVX512DQ-BW-NEXT: vpermt2d %zmm14, %zmm0, %zmm22
19861 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
19862 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm14
19863 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm1, %zmm14
19864 ; AVX512DQ-BW-NEXT: vpermt2d %zmm18, %zmm0, %zmm7
19865 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
19866 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19867 ; AVX512DQ-BW-NEXT: vpermi2d %zmm6, %zmm2, %zmm1
19868 ; AVX512DQ-BW-NEXT: vpermt2d %zmm6, %zmm0, %zmm2
19869 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6
19870 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
19871 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
19872 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
19873 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
19874 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1}
19875 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
19876 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
19877 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
19878 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3
19879 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1}
19880 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
19881 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1
19882 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1}
19883 ; AVX512DQ-BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
19884 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
19885 ; AVX512DQ-BW-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
19886 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4
19887 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1}
19888 ; AVX512DQ-BW-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
19889 ; AVX512DQ-BW-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
19890 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5
19891 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
19892 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
19893 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9
19894 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1}
19895 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
19896 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7
19897 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19898 ; AVX512DQ-BW-NEXT: vmovaps %zmm8, 192(%rsi)
19899 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19900 ; AVX512DQ-BW-NEXT: vmovaps %zmm8, 128(%rsi)
19901 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
19902 ; AVX512DQ-BW-NEXT: vmovaps %zmm8, 64(%rsi)
19903 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19904 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rsi)
19905 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19906 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%rdx)
19907 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19908 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rdx)
19909 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19910 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%rdx)
19911 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19912 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%rdx)
19913 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19914 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%rcx)
19915 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19916 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rcx)
19917 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19918 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%rcx)
19919 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19920 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%rcx)
19921 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19922 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%r8)
19923 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19924 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%r8)
19925 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19926 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%r8)
19927 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19928 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%r8)
19929 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19930 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%r9)
19931 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19932 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%r9)
19933 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19934 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%r9)
19935 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19936 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%r9)
19937 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
19938 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19939 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 192(%rax)
19940 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19941 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, (%rax)
19942 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19943 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 64(%rax)
19944 ; AVX512DQ-BW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
19945 ; AVX512DQ-BW-NEXT: vmovaps %zmm6, 128(%rax)
19946 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
19947 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax)
19948 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, (%rax)
19949 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
19950 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax)
19951 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
19952 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 128(%rax)
19953 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 192(%rax)
19954 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax)
19955 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%rax)
19956 ; AVX512DQ-BW-NEXT: addq $3144, %rsp # imm = 0xC48
19957 ; AVX512DQ-BW-NEXT: vzeroupper
19958 ; AVX512DQ-BW-NEXT: retq
19960 ; AVX512DQ-BW-FCP-LABEL: load_i32_stride8_vf64:
19961 ; AVX512DQ-BW-FCP: # %bb.0:
19962 ; AVX512DQ-BW-FCP-NEXT: subq $3144, %rsp # imm = 0xC48
19963 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm11
19964 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19965 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm18
19966 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1600(%rdi), %zmm31
19967 ; AVX512DQ-BW-FCP-NEXT: vmovaps 1536(%rdi), %zmm0
19968 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, (%rsp) # 64-byte Spill
19969 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1728(%rdi), %zmm24
19970 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19971 ; AVX512DQ-BW-FCP-NEXT: vmovaps 1664(%rdi), %zmm0
19972 ; AVX512DQ-BW-FCP-NEXT: vmovups %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19973 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1856(%rdi), %zmm21
19974 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1792(%rdi), %zmm26
19975 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1984(%rdi), %zmm22
19976 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1920(%rdi), %zmm5
19977 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1088(%rdi), %zmm14
19978 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1024(%rdi), %zmm3
19979 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1216(%rdi), %zmm19
19980 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1152(%rdi), %zmm2
19981 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1344(%rdi), %zmm13
19982 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1280(%rdi), %zmm27
19983 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1472(%rdi), %zmm20
19984 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 1408(%rdi), %zmm10
19985 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm17
19986 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm7
19987 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm9
19988 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19989 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm12
19990 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm6
19991 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
19992 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm28
19993 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm23
19994 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm4
19995 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [0,8,16,24,0,8,16,24,0,8,16,24,0,8,16,24]
19996 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
19997 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm1
19998 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm16
19999 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
20000 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
20001 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm4
20002 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %al
20003 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
20004 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20005 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm1
20006 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm15
20007 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
20008 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm12
20009 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
20010 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm12
20011 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm25
20012 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20013 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20014 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20015 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
20016 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm6
20017 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20018 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
20019 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
20020 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm4
20021 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm29
20022 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20023 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm1
20024 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
20025 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm1
20026 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm30
20027 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
20028 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
20029 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm12
20030 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
20031 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20032 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20033 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20034 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
20035 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20036 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
20037 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm22, %zmm0, %zmm1
20038 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm17
20039 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20040 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
20041 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
20042 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20043 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20044 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
20045 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
20046 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm2 # 64-byte Reload
20047 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
20048 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm24
20049 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm12
20050 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20051 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm5
20052 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20053 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20054 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20055 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm1
20056 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
20057 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
20058 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
20059 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm4
20060 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20061 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
20062 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20063 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm5
20064 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20065 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm1
20066 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm7
20067 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm19
20068 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
20069 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20070 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
20071 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
20072 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20073 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [1,9,17,25,1,9,17,25,1,9,17,25,1,9,17,25]
20074 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20075 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
20076 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
20077 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
20078 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20079 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
20080 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20081 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20082 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
20083 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm31
20084 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm1
20085 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm12
20086 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20087 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm30
20088 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm12
20089 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20090 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20091 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20092 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20093 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
20094 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
20095 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
20096 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20097 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
20098 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20099 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20100 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm1
20101 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
20102 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm1
20103 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20104 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm12
20105 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm12
20106 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20107 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20108 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20109 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
20110 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm1
20111 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
20112 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
20113 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20114 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm1
20115 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20116 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
20117 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
20118 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
20119 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20120 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20121 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20122 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20123 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
20124 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
20125 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
20126 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20127 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
20128 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20129 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20130 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
20131 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
20132 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
20133 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm7, %zmm0
20134 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
20135 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
20136 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20137 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [2,10,18,26,2,10,18,26,2,10,18,26,2,10,18,26]
20138 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20139 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
20140 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
20141 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
20142 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
20143 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm4
20144 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20145 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm8
20146 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm10
20147 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3],ymm8[4,5,6,7]
20148 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20149 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20150 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm1
20151 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
20152 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
20153 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
20154 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20155 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm13, %zmm0, %zmm15
20156 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm16
20157 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm9
20158 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm29
20159 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3],ymm15[4,5,6,7]
20160 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20161 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20162 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20163 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
20164 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20165 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
20166 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20167 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
20168 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm7
20169 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm4
20170 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20171 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20172 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
20173 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm25
20174 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
20175 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm3 # 64-byte Reload
20176 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
20177 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm15
20178 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm12
20179 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20180 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20181 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20182 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
20183 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
20184 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
20185 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
20186 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
20187 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20188 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm6
20189 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm1
20190 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm1
20191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm24
20192 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
20193 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
20194 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
20195 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
20196 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20197 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [3,11,19,27,3,11,19,27,3,11,19,27,3,11,19,27]
20198 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20199 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
20200 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
20201 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
20202 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20203 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
20204 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20205 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20206 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm21
20207 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
20208 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20209 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
20210 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20211 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20212 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20213 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
20214 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1
20215 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20216 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
20217 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
20218 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
20219 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm31
20220 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20221 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20222 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
20223 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20224 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
20225 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20226 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20227 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20228 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
20229 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
20230 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm4
20231 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
20232 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm26
20233 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20234 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
20235 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, %zmm9
20236 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
20237 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
20238 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
20239 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20240 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20241 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20242 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm1
20243 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
20244 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, %zmm25
20245 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
20246 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
20247 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
20248 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20249 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm1
20250 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm18
20251 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm24, %zmm0, %zmm1
20252 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm19, %zmm14, %zmm0
20253 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm24
20254 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm13
20255 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
20256 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
20257 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20258 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [4,12,20,28,4,12,20,28,4,12,20,28,4,12,20,28]
20259 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20260 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm1
20261 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20262 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm1
20263 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20264 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm4
20265 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm3
20266 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm4
20267 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20268 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20269 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
20270 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
20271 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20272 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm21, %zmm0, %zmm1
20273 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20274 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20275 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
20276 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20277 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20278 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20279 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm1
20280 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm23, %zmm0, %zmm1
20281 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
20282 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm4
20283 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20284 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20285 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
20286 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
20287 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20288 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
20289 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm29, %zmm11
20290 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
20291 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20292 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20293 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20294 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 # 64-byte Reload
20295 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
20296 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm10, %zmm0, %zmm1
20297 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm31
20298 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20299 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4
20300 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm19
20301 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm4
20302 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20303 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
20304 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
20305 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm26
20306 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm12 # 64-byte Reload
20307 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm15, %zmm0, %zmm12
20308 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm29
20309 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20310 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20311 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20312 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20313 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
20314 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
20315 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
20316 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
20317 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
20318 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm5, %zmm0, %zmm4
20319 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20320 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
20321 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
20322 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
20323 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm5
20324 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm13
20325 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20326 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm24, %zmm5, %zmm0
20327 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
20328 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
20329 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20330 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [5,13,21,29,5,13,21,29,5,13,21,29,5,13,21,29]
20331 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20332 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm17
20333 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm27
20334 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
20335 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm1
20336 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm1
20337 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20338 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm30, %zmm0, %zmm12
20339 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20340 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm27, %zmm1
20341 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20342 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 # 64-byte Reload
20343 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm1
20344 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20345 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm1
20346 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm4
20347 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20348 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm4
20349 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20350 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm1
20351 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm15
20352 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm1
20353 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm8
20354 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
20355 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm11, %zmm0, %zmm12
20356 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm2
20357 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20358 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20359 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20360 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm1
20361 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm31, %zmm0, %zmm1
20362 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm16
20363 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm4
20364 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm27
20365 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm19, %zmm0, %zmm4
20366 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm20
20367 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20368 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20369 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm1
20370 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm26, %zmm0, %zmm1
20371 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, %zmm17
20372 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 (%rsp), %zmm6 # 64-byte Reload
20373 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
20374 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm29, %zmm0, %zmm12
20375 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3],ymm1[4,5,6,7]
20376 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm4, %zmm1
20377 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20378 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm1
20379 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm1
20380 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm4
20381 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm4
20382 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1}
20383 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm1
20384 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm1
20385 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm13, %zmm5, %zmm0
20386 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
20387 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
20388 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20389 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [6,14,22,30,6,14,22,30,6,14,22,30,6,14,22,30]
20390 ; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20391 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm4
20392 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, %zmm30
20393 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm30
20394 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [7,15,23,31,7,15,23,31,7,15,23,31,7,15,23,31]
20395 ; AVX512DQ-BW-FCP-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
20396 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm4
20397 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm19
20398 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, %zmm11
20399 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm1, %zmm11
20400 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm7, %zmm0, %zmm28
20401 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm9
20402 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm1, %zmm9
20403 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm8, %zmm0, %zmm15
20404 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20405 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20406 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm8
20407 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
20408 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
20409 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20410 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm31
20411 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm1, %zmm31
20412 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm16, %zmm0, %zmm23
20413 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm16
20414 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm29
20415 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm1, %zmm29
20416 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm20, %zmm0, %zmm27
20417 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, %zmm26
20418 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm2
20419 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm5
20420 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm1, %zmm5
20421 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm17, %zmm0, %zmm2
20422 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
20423 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
20424 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20425 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm4
20426 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm6
20427 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
20428 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20429 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm23
20430 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20431 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm23
20432 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
20433 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm17
20434 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20435 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm24
20436 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20437 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm1, %zmm24
20438 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm2, %zmm0, %zmm3
20439 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm27
20440 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
20441 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13
20442 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
20443 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm1, %zmm13
20444 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm3, %zmm0, %zmm12
20445 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
20446 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm3
20447 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20448 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm1, %zmm3
20449 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm10
20450 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm6
20451 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm20
20452 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm1, %zmm20
20453 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm25, %zmm0, %zmm6
20454 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm21
20455 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, %zmm25
20456 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm1, %zmm25
20457 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm14, %zmm0, %zmm22
20458 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
20459 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm14
20460 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm1, %zmm14
20461 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm18, %zmm0, %zmm7
20462 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
20463 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20464 ; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm6, %zmm2, %zmm1
20465 ; AVX512DQ-BW-FCP-NEXT: vpermt2d %zmm6, %zmm0, %zmm2
20466 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
20467 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm13[4,5,6,7]
20468 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, %zmm24 {%k1}
20469 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
20470 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3],ymm9[4,5,6,7]
20471 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm30, %zmm11 {%k1}
20472 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
20473 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm5[4,5,6,7]
20474 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm31, %zmm29 {%k1}
20475 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm29, %zmm3
20476 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, %zmm25 {%k1}
20477 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm14[4,5,6,7]
20478 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm25, %zmm1
20479 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, %zmm28 {%k1}
20480 ; AVX512DQ-BW-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
20481 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm4 # 32-byte Folded Reload
20482 ; AVX512DQ-BW-FCP-NEXT: # ymm4 = ymm4[0,1,2,3],mem[4,5,6,7]
20483 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm28, %zmm4
20484 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, %zmm26 {%k1}
20485 ; AVX512DQ-BW-FCP-NEXT: vpblendd $240, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm5 # 32-byte Folded Reload
20486 ; AVX512DQ-BW-FCP-NEXT: # ymm5 = ymm15[0,1,2,3],mem[4,5,6,7]
20487 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm5, %zmm26, %zmm5
20488 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm27 {%k1}
20489 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm12[4,5,6,7]
20490 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm27, %zmm9
20491 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1}
20492 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2,3],ymm7[4,5,6,7]
20493 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm22, %zmm7
20494 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20495 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 192(%rsi)
20496 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20497 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 128(%rsi)
20498 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
20499 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm8, 64(%rsi)
20500 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20501 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rsi)
20502 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20503 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%rdx)
20504 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20505 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rdx)
20506 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20507 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%rdx)
20508 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20509 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%rdx)
20510 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20511 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%rcx)
20512 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20513 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rcx)
20514 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20515 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%rcx)
20516 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20517 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%rcx)
20518 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20519 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%r8)
20520 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20521 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%r8)
20522 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20523 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%r8)
20524 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20525 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%r8)
20526 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20527 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%r9)
20528 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20529 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%r9)
20530 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20531 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%r9)
20532 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20533 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%r9)
20534 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
20535 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20536 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 192(%rax)
20537 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20538 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, (%rax)
20539 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20540 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 64(%rax)
20541 ; AVX512DQ-BW-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
20542 ; AVX512DQ-BW-FCP-NEXT: vmovaps %zmm6, 128(%rax)
20543 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
20544 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax)
20545 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
20546 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
20547 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax)
20548 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
20549 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 128(%rax)
20550 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 192(%rax)
20551 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax)
20552 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%rax)
20553 ; AVX512DQ-BW-FCP-NEXT: addq $3144, %rsp # imm = 0xC48
20554 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
20555 ; AVX512DQ-BW-FCP-NEXT: retq
20556 %wide.vec = load <512 x i32>, ptr %in.vec, align 64
20557 %strided.vec0 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
20558 %strided.vec1 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
20559 %strided.vec2 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250, i32 258, i32 266, i32 274, i32 282, i32 290, i32 298, i32 306, i32 314, i32 322, i32 330, i32 338, i32 346, i32 354, i32 362, i32 370, i32 378, i32 386, i32 394, i32 402, i32 410, i32 418, i32 426, i32 434, i32 442, i32 450, i32 458, i32 466, i32 474, i32 482, i32 490, i32 498, i32 506>
20560 %strided.vec3 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251, i32 259, i32 267, i32 275, i32 283, i32 291, i32 299, i32 307, i32 315, i32 323, i32 331, i32 339, i32 347, i32 355, i32 363, i32 371, i32 379, i32 387, i32 395, i32 403, i32 411, i32 419, i32 427, i32 435, i32 443, i32 451, i32 459, i32 467, i32 475, i32 483, i32 491, i32 499, i32 507>
20561 %strided.vec4 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252, i32 260, i32 268, i32 276, i32 284, i32 292, i32 300, i32 308, i32 316, i32 324, i32 332, i32 340, i32 348, i32 356, i32 364, i32 372, i32 380, i32 388, i32 396, i32 404, i32 412, i32 420, i32 428, i32 436, i32 444, i32 452, i32 460, i32 468, i32 476, i32 484, i32 492, i32 500, i32 508>
20562 %strided.vec5 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253, i32 261, i32 269, i32 277, i32 285, i32 293, i32 301, i32 309, i32 317, i32 325, i32 333, i32 341, i32 349, i32 357, i32 365, i32 373, i32 381, i32 389, i32 397, i32 405, i32 413, i32 421, i32 429, i32 437, i32 445, i32 453, i32 461, i32 469, i32 477, i32 485, i32 493, i32 501, i32 509>
20563 %strided.vec6 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254, i32 262, i32 270, i32 278, i32 286, i32 294, i32 302, i32 310, i32 318, i32 326, i32 334, i32 342, i32 350, i32 358, i32 366, i32 374, i32 382, i32 390, i32 398, i32 406, i32 414, i32 422, i32 430, i32 438, i32 446, i32 454, i32 462, i32 470, i32 478, i32 486, i32 494, i32 502, i32 510>
20564 %strided.vec7 = shufflevector <512 x i32> %wide.vec, <512 x i32> poison, <64 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255, i32 263, i32 271, i32 279, i32 287, i32 295, i32 303, i32 311, i32 319, i32 327, i32 335, i32 343, i32 351, i32 359, i32 367, i32 375, i32 383, i32 391, i32 399, i32 407, i32 415, i32 423, i32 431, i32 439, i32 447, i32 455, i32 463, i32 471, i32 479, i32 487, i32 495, i32 503, i32 511>
20565 store <64 x i32> %strided.vec0, ptr %out.vec0, align 64
20566 store <64 x i32> %strided.vec1, ptr %out.vec1, align 64
20567 store <64 x i32> %strided.vec2, ptr %out.vec2, align 64
20568 store <64 x i32> %strided.vec3, ptr %out.vec3, align 64
20569 store <64 x i32> %strided.vec4, ptr %out.vec4, align 64
20570 store <64 x i32> %strided.vec5, ptr %out.vec5, align 64
20571 store <64 x i32> %strided.vec6, ptr %out.vec6, align 64
20572 store <64 x i32> %strided.vec7, ptr %out.vec7, align 64