1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
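; For reference, an illustrative source loop of roughly the following shape (a C
; sketch with hypothetical names, not part of the test input) is what the
; LoopVectorizer turns into a single wide load followed by eight stride-8
; shufflevector extractions, which is the IR pattern exercised below:
;
;   void deinterleave8(const short *in, short *o0, short *o1, short *o2,
;                      short *o3, short *o4, short *o5, short *o6, short *o7,
;                      int n) {
;     for (int i = 0; i < n; i++) {
;       o0[i] = in[8 * i + 0];
;       o1[i] = in[8 * i + 1];
;       o2[i] = in[8 * i + 2];
;       o3[i] = in[8 * i + 3];
;       o4[i] = in[8 * i + 4];
;       o5[i] = in[8 * i + 5];
;       o6[i] = in[8 * i + 6];
;       o7[i] = in[8 * i + 7];
;     }
;   }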
18 define void @load_i16_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
19 ; SSE-LABEL: load_i16_stride8_vf2:
; SSE: # %bb.0:
21 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
22 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
23 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
24 ; SSE-NEXT: movdqa (%rdi), %xmm0
25 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
26 ; SSE-NEXT: movdqa %xmm0, %xmm2
27 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
28 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
29 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
30 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
31 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
32 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
33 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
34 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
35 ; SSE-NEXT: movd %xmm2, (%rsi)
36 ; SSE-NEXT: movd %xmm3, (%rdx)
37 ; SSE-NEXT: movd %xmm4, (%rcx)
38 ; SSE-NEXT: movd %xmm5, (%r8)
39 ; SSE-NEXT: movd %xmm0, (%r9)
40 ; SSE-NEXT: movd %xmm1, (%r11)
41 ; SSE-NEXT: movd %xmm6, (%r10)
42 ; SSE-NEXT: movd %xmm7, (%rax)
; SSE-NEXT: retq
45 ; AVX-LABEL: load_i16_stride8_vf2:
; AVX: # %bb.0:
47 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
48 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
49 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
50 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
51 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
52 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
53 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
54 ; AVX-NEXT: vmovd %xmm2, (%rsi)
55 ; AVX-NEXT: vpextrd $1, %xmm2, (%rdx)
56 ; AVX-NEXT: vpextrd $2, %xmm2, (%rcx)
57 ; AVX-NEXT: vpextrd $3, %xmm2, (%r8)
58 ; AVX-NEXT: vmovd %xmm0, (%r9)
59 ; AVX-NEXT: vpextrd $1, %xmm0, (%r11)
60 ; AVX-NEXT: vpextrd $2, %xmm0, (%r10)
61 ; AVX-NEXT: vpextrd $3, %xmm0, (%rax)
; AVX-NEXT: retq
64 ; AVX2-LABEL: load_i16_stride8_vf2:
; AVX2: # %bb.0:
66 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
67 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
68 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
69 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
70 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
71 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
72 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
73 ; AVX2-NEXT: vmovd %xmm2, (%rsi)
74 ; AVX2-NEXT: vpextrd $1, %xmm2, (%rdx)
75 ; AVX2-NEXT: vpextrd $2, %xmm2, (%rcx)
76 ; AVX2-NEXT: vpextrd $3, %xmm2, (%r8)
77 ; AVX2-NEXT: vmovd %xmm0, (%r9)
78 ; AVX2-NEXT: vpextrd $1, %xmm0, (%r11)
79 ; AVX2-NEXT: vpextrd $2, %xmm0, (%r10)
80 ; AVX2-NEXT: vpextrd $3, %xmm0, (%rax)
; AVX2-NEXT: retq
83 ; AVX2-FP-LABEL: load_i16_stride8_vf2:
; AVX2-FP: # %bb.0:
85 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
86 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
87 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
88 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
89 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
90 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
91 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
92 ; AVX2-FP-NEXT: vmovd %xmm2, (%rsi)
93 ; AVX2-FP-NEXT: vpextrd $1, %xmm2, (%rdx)
94 ; AVX2-FP-NEXT: vpextrd $2, %xmm2, (%rcx)
95 ; AVX2-FP-NEXT: vpextrd $3, %xmm2, (%r8)
96 ; AVX2-FP-NEXT: vmovd %xmm0, (%r9)
97 ; AVX2-FP-NEXT: vpextrd $1, %xmm0, (%r11)
98 ; AVX2-FP-NEXT: vpextrd $2, %xmm0, (%r10)
99 ; AVX2-FP-NEXT: vpextrd $3, %xmm0, (%rax)
; AVX2-FP-NEXT: retq
102 ; AVX2-FCP-LABEL: load_i16_stride8_vf2:
; AVX2-FCP: # %bb.0:
104 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
105 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
106 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
107 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
108 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
109 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
110 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
111 ; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi)
112 ; AVX2-FCP-NEXT: vpextrd $1, %xmm2, (%rdx)
113 ; AVX2-FCP-NEXT: vpextrd $2, %xmm2, (%rcx)
114 ; AVX2-FCP-NEXT: vpextrd $3, %xmm2, (%r8)
115 ; AVX2-FCP-NEXT: vmovd %xmm0, (%r9)
116 ; AVX2-FCP-NEXT: vpextrd $1, %xmm0, (%r11)
117 ; AVX2-FCP-NEXT: vpextrd $2, %xmm0, (%r10)
118 ; AVX2-FCP-NEXT: vpextrd $3, %xmm0, (%rax)
119 ; AVX2-FCP-NEXT: retq
121 ; AVX512-LABEL: load_i16_stride8_vf2:
; AVX512: # %bb.0:
123 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
124 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
125 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
126 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
127 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
128 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
129 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
130 ; AVX512-NEXT: vmovd %xmm2, (%rsi)
131 ; AVX512-NEXT: vpextrd $1, %xmm2, (%rdx)
132 ; AVX512-NEXT: vpextrd $2, %xmm2, (%rcx)
133 ; AVX512-NEXT: vpextrd $3, %xmm2, (%r8)
134 ; AVX512-NEXT: vmovd %xmm0, (%r9)
135 ; AVX512-NEXT: vpextrd $1, %xmm0, (%r11)
136 ; AVX512-NEXT: vpextrd $2, %xmm0, (%r10)
137 ; AVX512-NEXT: vpextrd $3, %xmm0, (%rax)
; AVX512-NEXT: retq
140 ; AVX512-FCP-LABEL: load_i16_stride8_vf2:
141 ; AVX512-FCP: # %bb.0:
142 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
143 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
144 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
145 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
146 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
147 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
148 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
149 ; AVX512-FCP-NEXT: vmovd %xmm2, (%rsi)
150 ; AVX512-FCP-NEXT: vpextrd $1, %xmm2, (%rdx)
151 ; AVX512-FCP-NEXT: vpextrd $2, %xmm2, (%rcx)
152 ; AVX512-FCP-NEXT: vpextrd $3, %xmm2, (%r8)
153 ; AVX512-FCP-NEXT: vmovd %xmm0, (%r9)
154 ; AVX512-FCP-NEXT: vpextrd $1, %xmm0, (%r11)
155 ; AVX512-FCP-NEXT: vpextrd $2, %xmm0, (%r10)
156 ; AVX512-FCP-NEXT: vpextrd $3, %xmm0, (%rax)
157 ; AVX512-FCP-NEXT: retq
159 ; AVX512DQ-LABEL: load_i16_stride8_vf2:
; AVX512DQ: # %bb.0:
161 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
162 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
163 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
164 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
165 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
166 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
167 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
168 ; AVX512DQ-NEXT: vmovd %xmm2, (%rsi)
169 ; AVX512DQ-NEXT: vpextrd $1, %xmm2, (%rdx)
170 ; AVX512DQ-NEXT: vpextrd $2, %xmm2, (%rcx)
171 ; AVX512DQ-NEXT: vpextrd $3, %xmm2, (%r8)
172 ; AVX512DQ-NEXT: vmovd %xmm0, (%r9)
173 ; AVX512DQ-NEXT: vpextrd $1, %xmm0, (%r11)
174 ; AVX512DQ-NEXT: vpextrd $2, %xmm0, (%r10)
175 ; AVX512DQ-NEXT: vpextrd $3, %xmm0, (%rax)
176 ; AVX512DQ-NEXT: retq
178 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf2:
179 ; AVX512DQ-FCP: # %bb.0:
180 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
181 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
182 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
183 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
184 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
185 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
186 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
187 ; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rsi)
188 ; AVX512DQ-FCP-NEXT: vpextrd $1, %xmm2, (%rdx)
189 ; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm2, (%rcx)
190 ; AVX512DQ-FCP-NEXT: vpextrd $3, %xmm2, (%r8)
191 ; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%r9)
192 ; AVX512DQ-FCP-NEXT: vpextrd $1, %xmm0, (%r11)
193 ; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm0, (%r10)
194 ; AVX512DQ-FCP-NEXT: vpextrd $3, %xmm0, (%rax)
195 ; AVX512DQ-FCP-NEXT: retq
197 ; AVX512BW-LABEL: load_i16_stride8_vf2:
; AVX512BW: # %bb.0:
199 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
200 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
201 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
202 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
203 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
204 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
205 ; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
206 ; AVX512BW-NEXT: vmovd %xmm2, (%rsi)
207 ; AVX512BW-NEXT: vpextrd $1, %xmm2, (%rdx)
208 ; AVX512BW-NEXT: vpextrd $2, %xmm2, (%rcx)
209 ; AVX512BW-NEXT: vpextrd $3, %xmm2, (%r8)
210 ; AVX512BW-NEXT: vmovd %xmm0, (%r9)
211 ; AVX512BW-NEXT: vpextrd $1, %xmm0, (%r11)
212 ; AVX512BW-NEXT: vpextrd $2, %xmm0, (%r10)
213 ; AVX512BW-NEXT: vpextrd $3, %xmm0, (%rax)
214 ; AVX512BW-NEXT: retq
216 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf2:
217 ; AVX512BW-FCP: # %bb.0:
218 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
219 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
220 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
221 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
222 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
223 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
224 ; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
225 ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi)
226 ; AVX512BW-FCP-NEXT: vpextrd $1, %xmm2, (%rdx)
227 ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm2, (%rcx)
228 ; AVX512BW-FCP-NEXT: vpextrd $3, %xmm2, (%r8)
229 ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%r9)
230 ; AVX512BW-FCP-NEXT: vpextrd $1, %xmm0, (%r11)
231 ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, (%r10)
232 ; AVX512BW-FCP-NEXT: vpextrd $3, %xmm0, (%rax)
233 ; AVX512BW-FCP-NEXT: retq
235 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf2:
236 ; AVX512DQ-BW: # %bb.0:
237 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
238 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
239 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
240 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
241 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
242 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
243 ; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
244 ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rsi)
245 ; AVX512DQ-BW-NEXT: vpextrd $1, %xmm2, (%rdx)
246 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm2, (%rcx)
247 ; AVX512DQ-BW-NEXT: vpextrd $3, %xmm2, (%r8)
248 ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%r9)
249 ; AVX512DQ-BW-NEXT: vpextrd $1, %xmm0, (%r11)
250 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, (%r10)
251 ; AVX512DQ-BW-NEXT: vpextrd $3, %xmm0, (%rax)
252 ; AVX512DQ-BW-NEXT: retq
254 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf2:
255 ; AVX512DQ-BW-FCP: # %bb.0:
256 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
257 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
258 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
259 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
260 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
261 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
262 ; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
263 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi)
264 ; AVX512DQ-BW-FCP-NEXT: vpextrd $1, %xmm2, (%rdx)
265 ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm2, (%rcx)
266 ; AVX512DQ-BW-FCP-NEXT: vpextrd $3, %xmm2, (%r8)
267 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%r9)
268 ; AVX512DQ-BW-FCP-NEXT: vpextrd $1, %xmm0, (%r11)
269 ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, (%r10)
270 ; AVX512DQ-BW-FCP-NEXT: vpextrd $3, %xmm0, (%rax)
271 ; AVX512DQ-BW-FCP-NEXT: retq
272 %wide.vec = load <16 x i16>, ptr %in.vec, align 64
273 %strided.vec0 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 0, i32 8>
274 %strided.vec1 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 1, i32 9>
275 %strided.vec2 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 2, i32 10>
276 %strided.vec3 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 3, i32 11>
277 %strided.vec4 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 4, i32 12>
278 %strided.vec5 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 5, i32 13>
279 %strided.vec6 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 6, i32 14>
280 %strided.vec7 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 7, i32 15>
281 store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
282 store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
283 store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
284 store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
285 store <2 x i16> %strided.vec4, ptr %out.vec4, align 64
286 store <2 x i16> %strided.vec5, ptr %out.vec5, align 64
287 store <2 x i16> %strided.vec6, ptr %out.vec6, align 64
288 store <2 x i16> %strided.vec7, ptr %out.vec7, align 64
ret void
}
292 define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
293 ; SSE-LABEL: load_i16_stride8_vf4:
; SSE: # %bb.0:
295 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
296 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
297 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
298 ; SSE-NEXT: movdqa (%rdi), %xmm0
299 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
300 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
301 ; SSE-NEXT: movdqa 48(%rdi), %xmm3
302 ; SSE-NEXT: movdqa %xmm2, %xmm4
303 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
304 ; SSE-NEXT: movdqa %xmm0, %xmm5
305 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
306 ; SSE-NEXT: movdqa %xmm5, %xmm6
307 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
308 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
309 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,1,1]
310 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
311 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[3,3,3,3]
312 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
313 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
314 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
315 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
316 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
317 ; SSE-NEXT: movdqa %xmm0, %xmm1
318 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
319 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
320 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
321 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
322 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
323 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
324 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
325 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
326 ; SSE-NEXT: movq %xmm6, (%rsi)
327 ; SSE-NEXT: movq %xmm8, (%rdx)
328 ; SSE-NEXT: movq %xmm5, (%rcx)
329 ; SSE-NEXT: movq %xmm7, (%r8)
330 ; SSE-NEXT: movq %xmm1, (%r9)
331 ; SSE-NEXT: movq %xmm4, (%r11)
332 ; SSE-NEXT: movq %xmm0, (%r10)
333 ; SSE-NEXT: movq %xmm3, (%rax)
; SSE-NEXT: retq
336 ; AVX-LABEL: load_i16_stride8_vf4:
; AVX: # %bb.0:
338 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
339 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
340 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
341 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
342 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
343 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
344 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm3
345 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
346 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
347 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
348 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
349 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
350 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
351 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
352 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
353 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
354 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
355 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
356 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
357 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
358 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
359 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
360 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
361 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
362 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
363 ; AVX-NEXT: vmovq %xmm6, (%rsi)
364 ; AVX-NEXT: vmovq %xmm7, (%rdx)
365 ; AVX-NEXT: vmovq %xmm8, (%rcx)
366 ; AVX-NEXT: vmovq %xmm4, (%r8)
367 ; AVX-NEXT: vmovq %xmm1, (%r9)
368 ; AVX-NEXT: vmovq %xmm3, (%r11)
369 ; AVX-NEXT: vmovq %xmm5, (%r10)
370 ; AVX-NEXT: vmovq %xmm0, (%rax)
; AVX-NEXT: retq
373 ; AVX2-LABEL: load_i16_stride8_vf4:
; AVX2: # %bb.0:
375 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
376 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
377 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
378 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
379 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
380 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
381 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
382 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
383 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
384 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
385 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
386 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
387 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
388 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
389 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
390 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
391 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
392 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
393 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
394 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
395 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
396 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
397 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
398 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
399 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
400 ; AVX2-NEXT: vmovq %xmm6, (%rsi)
401 ; AVX2-NEXT: vmovq %xmm7, (%rdx)
402 ; AVX2-NEXT: vmovq %xmm8, (%rcx)
403 ; AVX2-NEXT: vmovq %xmm4, (%r8)
404 ; AVX2-NEXT: vmovq %xmm1, (%r9)
405 ; AVX2-NEXT: vmovq %xmm3, (%r11)
406 ; AVX2-NEXT: vmovq %xmm5, (%r10)
407 ; AVX2-NEXT: vmovq %xmm0, (%rax)
; AVX2-NEXT: retq
410 ; AVX2-FP-LABEL: load_i16_stride8_vf4:
; AVX2-FP: # %bb.0:
412 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
413 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
414 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
415 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
416 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
417 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
418 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3
419 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
420 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
421 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
422 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
423 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
424 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
425 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
426 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
427 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
428 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
429 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
430 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
431 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
432 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
433 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
434 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
435 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
436 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
437 ; AVX2-FP-NEXT: vmovq %xmm6, (%rsi)
438 ; AVX2-FP-NEXT: vmovq %xmm7, (%rdx)
439 ; AVX2-FP-NEXT: vmovq %xmm8, (%rcx)
440 ; AVX2-FP-NEXT: vmovq %xmm4, (%r8)
441 ; AVX2-FP-NEXT: vmovq %xmm1, (%r9)
442 ; AVX2-FP-NEXT: vmovq %xmm3, (%r11)
443 ; AVX2-FP-NEXT: vmovq %xmm5, (%r10)
444 ; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
; AVX2-FP-NEXT: retq
447 ; AVX2-FCP-LABEL: load_i16_stride8_vf4:
; AVX2-FCP: # %bb.0:
449 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
450 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
451 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
452 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
453 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
454 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
455 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
456 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
457 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
458 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
459 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
460 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
461 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
462 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
463 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
464 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
465 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
466 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
467 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
468 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
469 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
470 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
471 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
472 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
473 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
474 ; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi)
475 ; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx)
476 ; AVX2-FCP-NEXT: vmovq %xmm8, (%rcx)
477 ; AVX2-FCP-NEXT: vmovq %xmm4, (%r8)
478 ; AVX2-FCP-NEXT: vmovq %xmm1, (%r9)
479 ; AVX2-FCP-NEXT: vmovq %xmm3, (%r11)
480 ; AVX2-FCP-NEXT: vmovq %xmm5, (%r10)
481 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
482 ; AVX2-FCP-NEXT: retq
484 ; AVX512-LABEL: load_i16_stride8_vf4:
; AVX512: # %bb.0:
486 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
487 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
488 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
489 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
490 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
491 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
492 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
493 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
494 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
495 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
496 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
497 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
498 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
499 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
500 ; AVX512-NEXT: vpermt2d %xmm4, %xmm9, %xmm5
501 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
502 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
503 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
504 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
505 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
506 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
507 ; AVX512-NEXT: vpermt2d %xmm2, %xmm9, %xmm0
508 ; AVX512-NEXT: vmovq %xmm6, (%rsi)
509 ; AVX512-NEXT: vmovq %xmm7, (%rdx)
510 ; AVX512-NEXT: vmovq %xmm8, (%rcx)
511 ; AVX512-NEXT: vmovq %xmm5, (%r8)
512 ; AVX512-NEXT: vmovq %xmm1, (%r9)
513 ; AVX512-NEXT: vmovq %xmm3, (%r11)
514 ; AVX512-NEXT: vmovq %xmm4, (%r10)
515 ; AVX512-NEXT: vmovq %xmm0, (%rax)
; AVX512-NEXT: retq
518 ; AVX512-FCP-LABEL: load_i16_stride8_vf4:
519 ; AVX512-FCP: # %bb.0:
520 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
521 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
522 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
523 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
524 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
525 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
526 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
527 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
528 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
529 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
530 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
531 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm8
532 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8
533 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
534 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
535 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5
536 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
537 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
538 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
539 ; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7
540 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
541 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0
542 ; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi)
543 ; AVX512-FCP-NEXT: vmovq %xmm8, (%rdx)
544 ; AVX512-FCP-NEXT: vmovq %xmm9, (%rcx)
545 ; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
546 ; AVX512-FCP-NEXT: vmovq %xmm1, (%r9)
547 ; AVX512-FCP-NEXT: vmovq %xmm7, (%r11)
548 ; AVX512-FCP-NEXT: vmovq %xmm3, (%r10)
549 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
550 ; AVX512-FCP-NEXT: retq
552 ; AVX512DQ-LABEL: load_i16_stride8_vf4:
; AVX512DQ: # %bb.0:
554 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
555 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
556 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
557 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
558 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
559 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
560 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm3
561 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
562 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
563 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
564 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
565 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
566 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
567 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
568 ; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm9, %xmm5
569 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
570 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
571 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
572 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
573 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
574 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
575 ; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm9, %xmm0
576 ; AVX512DQ-NEXT: vmovq %xmm6, (%rsi)
577 ; AVX512DQ-NEXT: vmovq %xmm7, (%rdx)
578 ; AVX512DQ-NEXT: vmovq %xmm8, (%rcx)
579 ; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
580 ; AVX512DQ-NEXT: vmovq %xmm1, (%r9)
581 ; AVX512DQ-NEXT: vmovq %xmm3, (%r11)
582 ; AVX512DQ-NEXT: vmovq %xmm4, (%r10)
583 ; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
584 ; AVX512DQ-NEXT: retq
586 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf4:
587 ; AVX512DQ-FCP: # %bb.0:
588 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
589 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
590 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
591 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
592 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
593 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
594 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
595 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
596 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
597 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
598 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
599 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm8
600 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8
601 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
602 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
603 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5
604 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
605 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
606 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
607 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7
608 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
609 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0
610 ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi)
611 ; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rdx)
612 ; AVX512DQ-FCP-NEXT: vmovq %xmm9, (%rcx)
613 ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
614 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9)
615 ; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r11)
616 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r10)
617 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
618 ; AVX512DQ-FCP-NEXT: retq
620 ; AVX512BW-LABEL: load_i16_stride8_vf4:
; AVX512BW: # %bb.0:
622 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
623 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
624 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
625 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
626 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1
627 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2
628 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
629 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
630 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
631 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
632 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
633 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
634 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
635 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
636 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
637 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
638 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
639 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
640 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
641 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
642 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
643 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
644 ; AVX512BW-NEXT: vmovq %xmm3, (%rdx)
645 ; AVX512BW-NEXT: vmovq %xmm4, (%rcx)
646 ; AVX512BW-NEXT: vmovq %xmm5, (%r8)
647 ; AVX512BW-NEXT: vmovq %xmm6, (%r9)
648 ; AVX512BW-NEXT: vmovq %xmm7, (%r11)
649 ; AVX512BW-NEXT: vmovq %xmm8, (%r10)
650 ; AVX512BW-NEXT: vmovq %xmm9, (%rax)
651 ; AVX512BW-NEXT: vzeroupper
652 ; AVX512BW-NEXT: retq
654 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf4:
655 ; AVX512BW-FCP: # %bb.0:
656 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
657 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
658 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
659 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
660 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
661 ; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
662 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
663 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
664 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
665 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
666 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
667 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
668 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
669 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
670 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
671 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
672 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
673 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
674 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
675 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
676 ; AVX512BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
677 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
678 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rdx)
679 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%rcx)
680 ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r8)
681 ; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r9)
682 ; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r11)
683 ; AVX512BW-FCP-NEXT: vmovq %xmm8, (%r10)
684 ; AVX512BW-FCP-NEXT: vmovq %xmm9, (%rax)
685 ; AVX512BW-FCP-NEXT: vzeroupper
686 ; AVX512BW-FCP-NEXT: retq
688 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf4:
689 ; AVX512DQ-BW: # %bb.0:
690 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
691 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
692 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
693 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
694 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm1
695 ; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2
696 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
697 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
698 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
699 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
700 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
701 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
702 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
703 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
704 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
705 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
706 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
707 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
708 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
709 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
710 ; AVX512DQ-BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
711 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
712 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rdx)
713 ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%rcx)
714 ; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r8)
715 ; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r9)
716 ; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r11)
717 ; AVX512DQ-BW-NEXT: vmovq %xmm8, (%r10)
718 ; AVX512DQ-BW-NEXT: vmovq %xmm9, (%rax)
719 ; AVX512DQ-BW-NEXT: vzeroupper
720 ; AVX512DQ-BW-NEXT: retq
722 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf4:
723 ; AVX512DQ-BW-FCP: # %bb.0:
724 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
725 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
726 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
727 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
728 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
729 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2
730 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm0
731 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0]
732 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm3
733 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0]
734 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm4
735 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0]
736 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm5
737 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0]
738 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm6
739 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [5,13,21,29,0,0,0,0]
740 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm7
741 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0]
742 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm8
743 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0]
744 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm2, %ymm1, %ymm9
745 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
746 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rdx)
747 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%rcx)
748 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r8)
749 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r9)
750 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r11)
751 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm8, (%r10)
752 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm9, (%rax)
753 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
754 ; AVX512DQ-BW-FCP-NEXT: retq
755 %wide.vec = load <32 x i16>, ptr %in.vec, align 64
756 %strided.vec0 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
757 %strided.vec1 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
758 %strided.vec2 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
759 %strided.vec3 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
760 %strided.vec4 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
761 %strided.vec5 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
762 %strided.vec6 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
763 %strided.vec7 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
764 store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
765 store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
766 store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
767 store <4 x i16> %strided.vec3, ptr %out.vec3, align 64
768 store <4 x i16> %strided.vec4, ptr %out.vec4, align 64
769 store <4 x i16> %strided.vec5, ptr %out.vec5, align 64
770 store <4 x i16> %strided.vec6, ptr %out.vec6, align 64
771 store <4 x i16> %strided.vec7, ptr %out.vec7, align 64
ret void
}
775 define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
776 ; SSE-LABEL: load_i16_stride8_vf8:
; SSE: # %bb.0:
778 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
779 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
780 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
781 ; SSE-NEXT: movdqa (%rdi), %xmm0
782 ; SSE-NEXT: movdqa 16(%rdi), %xmm7
783 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
784 ; SSE-NEXT: movdqa 48(%rdi), %xmm9
785 ; SSE-NEXT: movdqa 80(%rdi), %xmm10
786 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
787 ; SSE-NEXT: movdqa 112(%rdi), %xmm11
788 ; SSE-NEXT: movdqa 96(%rdi), %xmm3
789 ; SSE-NEXT: movdqa %xmm3, %xmm13
790 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
791 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,0,0]
792 ; SSE-NEXT: movdqa %xmm1, %xmm12
793 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
794 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,0,0,0]
795 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
796 ; SSE-NEXT: movdqa %xmm2, %xmm14
797 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3]
798 ; SSE-NEXT: movdqa %xmm0, %xmm4
799 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
800 ; SSE-NEXT: movdqa %xmm4, %xmm5
801 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1]
802 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
803 ; SSE-NEXT: movdqa %xmm12, %xmm8
804 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1]
805 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[1,1,1,1]
806 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1]
807 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1]
808 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3]
809 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,2,2,2]
810 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,2,2,2]
811 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3]
812 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[3,3,3,3]
813 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm14[2],xmm4[3],xmm14[3]
814 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3]
815 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
816 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[3,3,3,3]
817 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1]
818 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm12[2,3]
819 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
820 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,0,0]
821 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
822 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,0,0]
823 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3]
824 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
825 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
826 ; SSE-NEXT: movdqa %xmm0, %xmm7
827 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
828 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
829 ; SSE-NEXT: movdqa %xmm1, %xmm9
830 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
831 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1]
832 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,1,1]
833 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
834 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[2,3]
835 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,2,2,2]
836 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,2,2,2]
837 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
838 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[3,3,3,3]
839 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
840 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3]
841 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
842 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
843 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
844 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3]
845 ; SSE-NEXT: movaps %xmm5, (%rsi)
846 ; SSE-NEXT: movaps %xmm6, (%rdx)
847 ; SSE-NEXT: movaps %xmm4, (%rcx)
848 ; SSE-NEXT: movaps %xmm8, (%r8)
849 ; SSE-NEXT: movaps %xmm7, (%r9)
850 ; SSE-NEXT: movaps %xmm11, (%r11)
851 ; SSE-NEXT: movaps %xmm0, (%r10)
852 ; SSE-NEXT: movaps %xmm9, (%rax)
; SSE-NEXT: retq
855 ; AVX-LABEL: load_i16_stride8_vf8:
; AVX: # %bb.0:
857 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
858 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
859 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
860 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm2
861 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm3
862 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
863 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0]
864 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm5
865 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm6
866 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
867 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
868 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
869 ; AVX-NEXT: vmovdqa (%rdi), %xmm8
870 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
871 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm10
872 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm11
873 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
874 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
875 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
876 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
877 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
878 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1]
879 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm12[2,3],xmm14[4,5,6,7]
880 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4,5,6,7]
881 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,2,2]
882 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0,1,2,3,4,5],xmm14[6,7]
883 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
884 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4,5,6,7]
885 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
886 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,3,2,3]
887 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3]
888 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3],xmm12[4,5,6,7]
889 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7]
890 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
891 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
892 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
893 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1]
894 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5],xmm3[6,7]
895 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
896 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
897 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
898 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3],xmm3[4,5,6,7]
899 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
900 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
901 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm6[2,3],xmm9[4,5,6,7]
902 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5,6,7]
903 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
904 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,5],xmm9[6,7]
905 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
906 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7]
907 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
908 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
909 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
910 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
911 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
912 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
913 ; AVX-NEXT: vmovdqa %xmm1, (%rdx)
914 ; AVX-NEXT: vmovdqa %xmm14, (%rcx)
915 ; AVX-NEXT: vmovdqa %xmm4, (%r8)
916 ; AVX-NEXT: vmovdqa %xmm3, (%r9)
917 ; AVX-NEXT: vmovdqa %xmm8, (%r11)
918 ; AVX-NEXT: vmovdqa %xmm9, (%r10)
919 ; AVX-NEXT: vmovdqa %xmm2, (%rax)
; AVX-NEXT: retq
922 ; AVX2-LABEL: load_i16_stride8_vf8:
; AVX2: # %bb.0:
924 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
925 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
926 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
927 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm2
928 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm3
929 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
930 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm0
931 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm5
932 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm6
933 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
934 ; AVX2-NEXT: vpbroadcastd %xmm7, %xmm1
935 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
936 ; AVX2-NEXT: vmovdqa (%rdi), %xmm8
937 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9
938 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm10
939 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm11
940 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
941 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
942 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
943 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
944 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
945 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1]
946 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3]
947 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
948 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,2,2]
949 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0,1,2],xmm14[3]
950 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
951 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
952 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
953 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,3,2,3]
954 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3]
955 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3]
956 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
957 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
958 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm3
959 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
960 ; AVX2-NEXT: vpbroadcastd %xmm5, %xmm6
961 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
962 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
963 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
964 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
965 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
966 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
967 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
968 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2,3]
969 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
970 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
971 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3]
972 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
973 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
974 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
975 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
976 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
977 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
978 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
979 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
980 ; AVX2-NEXT: vmovdqa %xmm1, (%rdx)
981 ; AVX2-NEXT: vmovdqa %xmm14, (%rcx)
982 ; AVX2-NEXT: vmovdqa %xmm4, (%r8)
983 ; AVX2-NEXT: vmovdqa %xmm3, (%r9)
984 ; AVX2-NEXT: vmovdqa %xmm8, (%r11)
985 ; AVX2-NEXT: vmovdqa %xmm9, (%r10)
986 ; AVX2-NEXT: vmovdqa %xmm2, (%rax)
; AVX2-NEXT: retq
989 ; AVX2-FP-LABEL: load_i16_stride8_vf8:
; AVX2-FP: # %bb.0:
991 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
992 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
993 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
994 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm2
995 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm3
996 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
997 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm0
998 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm5
999 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm6
1000 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1001 ; AVX2-FP-NEXT: vpbroadcastd %xmm7, %xmm1
1002 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1003 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm8
1004 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm9
1005 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm10
1006 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm11
1007 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1008 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
1009 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
1010 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1011 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
1012 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1]
1013 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3]
1014 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
1015 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,2,2]
1016 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0,1,2],xmm14[3]
1017 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
1018 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
1019 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
1020 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,3,2,3]
1021 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3]
1022 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3]
1023 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
1024 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1025 ; AVX2-FP-NEXT: vpbroadcastd %xmm2, %xmm3
1026 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1027 ; AVX2-FP-NEXT: vpbroadcastd %xmm5, %xmm6
1028 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1029 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
1030 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
1031 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1032 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
1033 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1034 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
1035 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2,3]
1036 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
1037 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
1038 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3]
1039 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1040 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1041 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1042 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
1043 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
1044 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
1045 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
1046 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsi)
1047 ; AVX2-FP-NEXT: vmovdqa %xmm1, (%rdx)
1048 ; AVX2-FP-NEXT: vmovdqa %xmm14, (%rcx)
1049 ; AVX2-FP-NEXT: vmovdqa %xmm4, (%r8)
1050 ; AVX2-FP-NEXT: vmovdqa %xmm3, (%r9)
1051 ; AVX2-FP-NEXT: vmovdqa %xmm8, (%r11)
1052 ; AVX2-FP-NEXT: vmovdqa %xmm9, (%r10)
1053 ; AVX2-FP-NEXT: vmovdqa %xmm2, (%rax)
1054 ; AVX2-FP-NEXT: retq
1056 ; AVX2-FCP-LABEL: load_i16_stride8_vf8:
1057 ; AVX2-FCP: # %bb.0:
1058 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1059 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1060 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1061 ; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm2
1062 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
1063 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1064 ; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm0
1065 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm5
1066 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
1067 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1068 ; AVX2-FCP-NEXT: vpbroadcastd %xmm7, %xmm1
1069 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1070 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm8
1071 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm9
1072 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm10
1073 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm11
1074 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1075 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
1076 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
1077 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1078 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
1079 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1]
1080 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3]
1081 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
1082 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,2,2]
1083 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0,1,2],xmm14[3]
1084 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
1085 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
1086 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
1087 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,3,2,3]
1088 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3]
1089 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3]
1090 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
1091 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1092 ; AVX2-FCP-NEXT: vpbroadcastd %xmm2, %xmm3
1093 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1094 ; AVX2-FCP-NEXT: vpbroadcastd %xmm5, %xmm6
1095 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1096 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
1097 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
1098 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1099 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
1100 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1101 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
1102 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2,3]
1103 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
1104 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
1105 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3]
1106 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1107 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1108 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1109 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
1110 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
1111 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
1112 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
1113 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsi)
1114 ; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rdx)
1115 ; AVX2-FCP-NEXT: vmovdqa %xmm14, (%rcx)
1116 ; AVX2-FCP-NEXT: vmovdqa %xmm4, (%r8)
1117 ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%r9)
1118 ; AVX2-FCP-NEXT: vmovdqa %xmm8, (%r11)
1119 ; AVX2-FCP-NEXT: vmovdqa %xmm9, (%r10)
1120 ; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rax)
1121 ; AVX2-FCP-NEXT: retq
1123 ; AVX512-LABEL: load_i16_stride8_vf8:
1124 ; AVX512: # %bb.0:
1125 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1
1126 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2
1127 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1128 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3
1129 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4
1130 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1131 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
1132 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1133 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
1134 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
1135 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1136 ; AVX512-NEXT: vpermt2d %xmm5, %xmm0, %xmm6
1137 ; AVX512-NEXT: vmovdqa (%rdi), %xmm5
1138 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm10
1139 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm11
1140 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm12
1141 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
1142 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
1143 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
1144 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3]
1145 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,1,1]
1146 ; AVX512-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3]
1147 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
1148 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
1149 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
1150 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0]
1151 ; AVX512-NEXT: vpermt2d %xmm13, %xmm15, %xmm14
1152 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
1153 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1154 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1155 ; AVX512-NEXT: vpermi2d %xmm1, %xmm2, %xmm0
1156 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
1157 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
1158 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1159 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
1160 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1161 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
1162 ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3]
1163 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
1164 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2]
1165 ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3]
1166 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1167 ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
1168 ; AVX512-NEXT: vpermt2d %xmm3, %xmm15, %xmm4
1169 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1170 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
1171 ; AVX512-NEXT: vmovdqa %xmm6, (%rsi)
1172 ; AVX512-NEXT: vmovdqa %xmm7, (%rdx)
1173 ; AVX512-NEXT: vmovdqa %xmm8, (%rcx)
1174 ; AVX512-NEXT: vmovdqa %xmm9, (%r8)
1175 ; AVX512-NEXT: vmovdqa %xmm0, (%r9)
1176 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1177 ; AVX512-NEXT: vmovdqa %xmm5, (%rax)
1178 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1179 ; AVX512-NEXT: vmovdqa %xmm10, (%rax)
1180 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1181 ; AVX512-NEXT: vmovdqa %xmm1, (%rax)
1182 ; AVX512-NEXT: retq
1184 ; AVX512-FCP-LABEL: load_i16_stride8_vf8:
1185 ; AVX512-FCP: # %bb.0:
1186 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
1187 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
1188 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1189 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm17
1190 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm5
1191 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
1192 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1193 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4]
1194 ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm0
1195 ; AVX512-FCP-NEXT: vpermt2d %xmm10, %xmm3, %xmm0
1196 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm7
1197 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
1198 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
1199 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm13
1200 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1201 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
1202 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
1203 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1204 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
1205 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0]
1206 ; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm2
1207 ; AVX512-FCP-NEXT: vpermt2d %xmm14, %xmm9, %xmm2
1208 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
1209 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
1210 ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm0
1211 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1212 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6]
1213 ; AVX512-FCP-NEXT: vpermt2d %xmm10, %xmm1, %xmm0
1214 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
1215 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
1216 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0]
1217 ; AVX512-FCP-NEXT: vpermt2d %xmm14, %xmm10, %xmm15
1218 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
1219 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm14
1220 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
1221 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1222 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
1223 ; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm3
1224 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
1225 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1226 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
1227 ; AVX512-FCP-NEXT: vpermi2d %xmm6, %xmm7, %xmm9
1228 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
1229 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
1230 ; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm1
1231 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1232 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
1233 ; AVX512-FCP-NEXT: vpermt2d %xmm6, %xmm10, %xmm7
1234 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1235 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
1236 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, (%rsi)
1237 ; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rdx)
1238 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rcx)
1239 ; AVX512-FCP-NEXT: vmovdqa %xmm11, (%r8)
1240 ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%r9)
1241 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1242 ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rax)
1243 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1244 ; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rax)
1245 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1246 ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rax)
1247 ; AVX512-FCP-NEXT: retq
1249 ; AVX512DQ-LABEL: load_i16_stride8_vf8:
1250 ; AVX512DQ: # %bb.0:
1251 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1
1252 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2
1253 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1254 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3
1255 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4
1256 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1257 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
1258 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1259 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
1260 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
1261 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1262 ; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm0, %xmm6
1263 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5
1264 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm10
1265 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm11
1266 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm12
1267 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
1268 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
1269 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
1270 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3]
1271 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,1,1]
1272 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3]
1273 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
1274 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
1275 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
1276 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0]
1277 ; AVX512DQ-NEXT: vpermt2d %xmm13, %xmm15, %xmm14
1278 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
1279 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1280 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1281 ; AVX512DQ-NEXT: vpermi2d %xmm1, %xmm2, %xmm0
1282 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
1283 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
1284 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1285 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
1286 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1287 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
1288 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3]
1289 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
1290 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2]
1291 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3]
1292 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1293 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
1294 ; AVX512DQ-NEXT: vpermt2d %xmm3, %xmm15, %xmm4
1295 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1296 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
1297 ; AVX512DQ-NEXT: vmovdqa %xmm6, (%rsi)
1298 ; AVX512DQ-NEXT: vmovdqa %xmm7, (%rdx)
1299 ; AVX512DQ-NEXT: vmovdqa %xmm8, (%rcx)
1300 ; AVX512DQ-NEXT: vmovdqa %xmm9, (%r8)
1301 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%r9)
1302 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1303 ; AVX512DQ-NEXT: vmovdqa %xmm5, (%rax)
1304 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1305 ; AVX512DQ-NEXT: vmovdqa %xmm10, (%rax)
1306 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1307 ; AVX512DQ-NEXT: vmovdqa %xmm1, (%rax)
1308 ; AVX512DQ-NEXT: retq
1310 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf8:
1311 ; AVX512DQ-FCP: # %bb.0:
1312 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
1313 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
1314 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1315 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm17
1316 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm5
1317 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
1318 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1319 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4]
1320 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm0
1321 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm10, %xmm3, %xmm0
1322 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm7
1323 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
1324 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
1325 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm13
1326 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1327 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
1328 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
1329 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1330 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
1331 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0]
1332 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm2
1333 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm14, %xmm9, %xmm2
1334 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
1335 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
1336 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm0
1337 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1338 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6]
1339 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm10, %xmm1, %xmm0
1340 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
1341 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
1342 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0]
1343 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm14, %xmm10, %xmm15
1344 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
1345 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm14
1346 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
1347 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1348 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
1349 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm3
1350 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
1351 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1352 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
1353 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm6, %xmm7, %xmm9
1354 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
1355 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
1356 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm1
1357 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1358 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
1359 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm6, %xmm10, %xmm7
1360 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1361 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
1362 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, (%rsi)
1363 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rdx)
1364 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rcx)
1365 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%r8)
1366 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%r9)
1367 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1368 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rax)
1369 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1370 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rax)
1371 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1372 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rax)
1373 ; AVX512DQ-FCP-NEXT: retq
1375 ; AVX512BW-LABEL: load_i16_stride8_vf8:
1376 ; AVX512BW: # %bb.0:
1377 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1378 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1379 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
1380 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56]
1381 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
1382 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
1383 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1384 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57]
1385 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1386 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58]
1387 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1388 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59]
1389 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1390 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60]
1391 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6
1392 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61]
1393 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7
1394 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62]
1395 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm8
1396 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63]
1397 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
1398 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
1399 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
1400 ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
1401 ; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
1402 ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
1403 ; AVX512BW-NEXT: vmovdqa %xmm7, (%r11)
1404 ; AVX512BW-NEXT: vmovdqa %xmm8, (%r10)
1405 ; AVX512BW-NEXT: vmovdqa %xmm9, (%rax)
1406 ; AVX512BW-NEXT: vzeroupper
1407 ; AVX512BW-NEXT: retq
1409 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf8:
1410 ; AVX512BW-FCP: # %bb.0:
1411 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1412 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1413 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1414 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56]
1415 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
1416 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
1417 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1418 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57]
1419 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1420 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58]
1421 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1422 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59]
1423 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1424 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60]
1425 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6
1426 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61]
1427 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7
1428 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62]
1429 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm8
1430 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63]
1431 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
1432 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
1433 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
1434 ; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
1435 ; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
1436 ; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
1437 ; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r11)
1438 ; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
1439 ; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%rax)
1440 ; AVX512BW-FCP-NEXT: vzeroupper
1441 ; AVX512BW-FCP-NEXT: retq
1443 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf8:
1444 ; AVX512DQ-BW: # %bb.0:
1445 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1446 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1447 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
1448 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56]
1449 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
1450 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
1451 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1452 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57]
1453 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1454 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58]
1455 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1456 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59]
1457 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1458 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60]
1459 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6
1460 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61]
1461 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7
1462 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62]
1463 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm8
1464 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63]
1465 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
1466 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi)
1467 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
1468 ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
1469 ; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8)
1470 ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9)
1471 ; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%r11)
1472 ; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%r10)
1473 ; AVX512DQ-BW-NEXT: vmovdqa %xmm9, (%rax)
1474 ; AVX512DQ-BW-NEXT: vzeroupper
1475 ; AVX512DQ-BW-NEXT: retq
1477 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf8:
1478 ; AVX512DQ-BW-FCP: # %bb.0:
1479 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1480 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1481 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1482 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56]
1483 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
1484 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
1485 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1486 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57]
1487 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1488 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58]
1489 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1490 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59]
1491 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1492 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60]
1493 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6
1494 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61]
1495 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7
1496 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62]
1497 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm8
1498 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63]
1499 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
1500 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
1501 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
1502 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
1503 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
1504 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
1505 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r11)
1506 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
1507 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%rax)
1508 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1509 ; AVX512DQ-BW-FCP-NEXT: retq
1510 %wide.vec = load <64 x i16>, ptr %in.vec, align 64
1511 %strided.vec0 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
1512 %strided.vec1 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
1513 %strided.vec2 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
1514 %strided.vec3 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
1515 %strided.vec4 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
1516 %strided.vec5 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
1517 %strided.vec6 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
1518 %strided.vec7 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
1519 store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
1520 store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
1521 store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
1522 store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
1523 store <8 x i16> %strided.vec4, ptr %out.vec4, align 64
1524 store <8 x i16> %strided.vec5, ptr %out.vec5, align 64
1525 store <8 x i16> %strided.vec6, ptr %out.vec6, align 64
1526 store <8 x i16> %strided.vec7, ptr %out.vec7, align 64
1527 ret void
1528 }
1530 define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
1531 ; SSE-LABEL: load_i16_stride8_vf16:
1532 ; SSE: # %bb.0:
1533 ; SSE-NEXT: subq $168, %rsp
1534 ; SSE-NEXT: movdqa 112(%rdi), %xmm6
1535 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1536 ; SSE-NEXT: movdqa 96(%rdi), %xmm14
1537 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1538 ; SSE-NEXT: movdqa 208(%rdi), %xmm0
1539 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
1540 ; SSE-NEXT: movdqa 192(%rdi), %xmm5
1541 ; SSE-NEXT: movdqa 240(%rdi), %xmm1
1542 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1543 ; SSE-NEXT: movdqa 224(%rdi), %xmm12
1544 ; SSE-NEXT: movdqa 144(%rdi), %xmm3
1545 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1546 ; SSE-NEXT: movdqa 128(%rdi), %xmm10
1547 ; SSE-NEXT: movdqa 176(%rdi), %xmm2
1548 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1549 ; SSE-NEXT: movdqa 160(%rdi), %xmm11
1550 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1551 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
1552 ; SSE-NEXT: movdqa %xmm10, %xmm2
1553 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1554 ; SSE-NEXT: movdqa %xmm12, %xmm4
1555 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1556 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
1557 ; SSE-NEXT: movdqa %xmm5, %xmm1
1558 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1559 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
1560 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1561 ; SSE-NEXT: movdqa %xmm2, %xmm3
1562 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1]
1563 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
1564 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1565 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
1566 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
1567 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1568 ; SSE-NEXT: movdqa 64(%rdi), %xmm3
1569 ; SSE-NEXT: movdqa %xmm3, %xmm7
1570 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
1571 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0]
1572 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,0,0,0]
1573 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
1574 ; SSE-NEXT: movdqa 32(%rdi), %xmm9
1575 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1576 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
1577 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1578 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
1579 ; SSE-NEXT: movdqa (%rdi), %xmm6
1580 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
1581 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1582 ; SSE-NEXT: movdqa %xmm6, %xmm8
1583 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
1584 ; SSE-NEXT: movdqa %xmm8, %xmm0
1585 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
1586 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
1587 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1588 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
1589 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[1,1,1,1]
1590 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
1591 ; SSE-NEXT: movdqa %xmm1, %xmm15
1592 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1]
1593 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
1594 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1595 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[1,1,1,1]
1596 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
1597 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
1598 ; SSE-NEXT: movdqa %xmm7, %xmm15
1599 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
1600 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
1601 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1602 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,2,2]
1603 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
1604 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
1605 ; SSE-NEXT: movdqa %xmm2, %xmm15
1606 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3]
1607 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1]
1608 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1609 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,2,2,2]
1610 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2]
1611 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
1612 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[3,3,3,3]
1613 ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
1614 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3]
1615 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1616 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
1617 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3]
1618 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1619 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
1620 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1621 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3]
1622 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3]
1623 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
1624 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[2,3]
1625 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1626 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1627 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
1628 ; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
1629 ; SSE-NEXT: movdqa %xmm10, %xmm15
1630 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1631 ; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
1632 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1633 ; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7]
1634 ; SSE-NEXT: movdqa %xmm5, %xmm2
1635 ; SSE-NEXT: punpckhwd (%rsp), %xmm2 # 16-byte Folded Reload
1636 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
1637 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
1638 ; SSE-NEXT: movdqa %xmm12, %xmm0
1639 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,0,0]
1640 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
1641 ; SSE-NEXT: movdqa %xmm15, %xmm1
1642 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
1643 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1]
1644 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1645 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
1646 ; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
1647 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1648 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
1649 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
1650 ; SSE-NEXT: movdqa %xmm10, %xmm14
1651 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1652 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0]
1653 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1654 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1655 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1656 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
1657 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
1658 ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
1659 ; SSE-NEXT: movdqa %xmm6, %xmm1
1660 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
1661 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
1662 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,1,1,1]
1663 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1]
1664 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
1665 ; SSE-NEXT: movdqa %xmm2, %xmm7
1666 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
1667 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3]
1668 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1]
1669 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1]
1670 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
1671 ; SSE-NEXT: movdqa %xmm3, %xmm9
1672 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
1673 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
1674 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[2,2,2,2]
1675 ; SSE-NEXT: movdqa %xmm0, %xmm10
1676 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
1677 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
1678 ; SSE-NEXT: movdqa %xmm15, %xmm11
1679 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
1680 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm11[0],xmm9[1]
1681 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[2,2,2,2]
1682 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
1683 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
1684 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[3,3,3,3]
1685 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1686 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3]
1687 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3]
1688 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3]
1689 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[3,3,3,3]
1690 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
1691 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
1692 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1693 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
1694 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
1695 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
1696 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3]
1697 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1698 ; SSE-NEXT: movaps %xmm2, (%rsi)
1699 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1700 ; SSE-NEXT: movaps %xmm3, 16(%rsi)
1701 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1702 ; SSE-NEXT: movaps %xmm2, (%rdx)
1703 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1704 ; SSE-NEXT: movaps %xmm2, 16(%rdx)
1705 ; SSE-NEXT: movaps %xmm8, (%rcx)
1706 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1707 ; SSE-NEXT: movaps %xmm2, 16(%rcx)
1708 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1709 ; SSE-NEXT: movaps %xmm2, (%r8)
1710 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1711 ; SSE-NEXT: movaps %xmm2, 16(%r8)
1712 ; SSE-NEXT: movaps %xmm1, (%r9)
1713 ; SSE-NEXT: movapd %xmm12, 16(%r9)
1714 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1715 ; SSE-NEXT: movaps %xmm7, (%rax)
1716 ; SSE-NEXT: movaps %xmm4, 16(%rax)
1717 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1718 ; SSE-NEXT: movaps %xmm6, (%rax)
1719 ; SSE-NEXT: movapd %xmm9, 16(%rax)
1720 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1721 ; SSE-NEXT: movaps %xmm0, 16(%rax)
1722 ; SSE-NEXT: movaps %xmm14, (%rax)
1723 ; SSE-NEXT: addq $168, %rsp
1724 ; SSE-NEXT: retq
1726 ; AVX-LABEL: load_i16_stride8_vf16:
1727 ; AVX: # %bb.0:
1728 ; AVX-NEXT: subq $152, %rsp
1729 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm0
1730 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1731 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm1
1732 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1733 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1734 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm0
1735 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1736 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm1
1737 ; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
1738 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1739 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
1740 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1741 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm1
1742 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1743 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm2
1744 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1745 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1746 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,0,0,0]
1747 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm2
1748 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1749 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm3
1750 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1751 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1752 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
1753 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
1754 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
1755 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1756 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm0
1757 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1758 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm1
1759 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1760 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1761 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0]
1762 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm1
1763 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1764 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm15
1765 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
1766 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
1767 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm0[6,7]
1768 ; AVX-NEXT: vmovdqa (%rdi), %xmm14
1769 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm12
1770 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm11
1771 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm10
1772 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1773 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
1774 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1775 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3],xmm6[4,5,6,7]
1776 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
1777 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1778 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
1779 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1780 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
1781 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
1782 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
1783 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
1784 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1]
1785 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3],xmm9[4,5,6,7]
1786 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1787 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm13[4,5,6,7]
1788 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
1789 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1790 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
1791 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1792 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,2,2]
1793 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5],xmm9[6,7]
1794 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
1795 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
1796 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,2,2,2]
1797 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5],xmm9[6,7]
1798 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1799 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4,5,6,7]
1800 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm6[4,5,6,7]
1801 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1802 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
1803 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
1804 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
1805 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
1806 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
1807 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
1808 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1809 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1810 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1811 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1812 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1813 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1814 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1815 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1816 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1817 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
1818 ; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
1819 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
1820 ; AVX-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
1821 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1822 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
1823 ; AVX-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
1824 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1825 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
1826 ; AVX-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
1827 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,0,0]
1828 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,0,1]
1829 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7]
1830 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1831 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
1832 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
1833 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7]
1834 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1835 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
1836 ; AVX-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
1837 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm9 # 16-byte Folded Reload
1838 ; AVX-NEXT: # xmm9 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
1839 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,0,0]
1840 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,1,0,1]
1841 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6,7]
1842 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
1843 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
1844 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1845 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4,5,6,7]
1846 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
1847 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1848 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
1849 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1]
1850 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6,7]
1851 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
1852 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
1853 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
1854 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[1,1,1,1]
1855 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,3],xmm11[4,5,6,7]
1856 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
1857 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
1858 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1859 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
1860 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2]
1861 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,5],xmm11[6,7]
1862 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
1863 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
1864 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,2,2]
1865 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5],xmm11[6,7]
1866 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1867 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
1868 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1869 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1870 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1871 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1872 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1873 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1
1874 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1875 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1876 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
1877 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3]
1878 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,3,3,3]
1879 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
1880 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1881 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1882 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1883 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
1884 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1885 ; AVX-NEXT: vmovaps %ymm1, (%rdx)
1886 ; AVX-NEXT: vmovaps %ymm13, (%rcx)
1887 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1888 ; AVX-NEXT: vmovaps %ymm1, (%r8)
1889 ; AVX-NEXT: vmovaps %ymm5, (%r9)
1890 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1891 ; AVX-NEXT: vmovaps %ymm8, (%rax)
1892 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1893 ; AVX-NEXT: vmovaps %ymm10, (%rax)
1894 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1895 ; AVX-NEXT: vmovaps %ymm0, (%rax)
1896 ; AVX-NEXT: addq $152, %rsp
1897 ; AVX-NEXT: vzeroupper
1898 ; AVX-NEXT: retq
1900 ; AVX2-LABEL: load_i16_stride8_vf16:
1901 ; AVX2: # %bb.0:
1902 ; AVX2-NEXT: subq $264, %rsp # imm = 0x108
1903 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2
1904 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1905 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm5
1906 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1907 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm0
1908 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1909 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm1
1910 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1911 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1912 ; AVX2-NEXT: vpbroadcastd %xmm12, %xmm0
1913 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm1
1914 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1915 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3
1916 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1917 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1918 ; AVX2-NEXT: vpbroadcastd %xmm9, %xmm1
1919 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1920 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1921 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1922 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
1923 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1924 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
1925 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1926 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm6
1927 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1928 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
1929 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1930 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
1931 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3]
1932 ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm5[0,1,0,2]
1933 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,2,0,4,5,6,4]
1934 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
1935 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm2[0,1,0,2]
1936 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,0,4,5,6,4]
1937 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
1938 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
1939 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0
1940 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1941 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1
1942 ; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
1943 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2]
1944 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,2,2,3,4,6,6,7]
1945 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
1946 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2]
1947 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,2,2,3,4,6,6,7]
1948 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
1949 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
1950 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
1951 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
1952 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1953 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,1,1]
1954 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
1955 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
1956 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
1957 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
1958 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
1959 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
1960 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
1961 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
1962 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
1963 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
1964 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
1965 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1966 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,2,2,2]
1967 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
1968 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
1969 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1970 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,5,7]
1971 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,1,3,4,5,5,7]
1972 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
1973 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
1974 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
1975 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,1,2,3,7,5,6,7]
1976 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,2,3,7,5,6,7]
1977 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
1978 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
1979 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
1980 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
1981 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1982 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1983 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
1984 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,3,2,3]
1985 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
1986 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
1987 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1988 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
1989 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
1990 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
1991 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
1992 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
1993 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
1994 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
1995 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1996 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1997 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1998 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1999 ; AVX2-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
2000 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1
2001 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2002 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
2003 ; AVX2-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2004 ; AVX2-NEXT: vpbroadcastd %xmm5, %xmm2
2005 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2006 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2007 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2008 ; AVX2-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
2009 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2010 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
2011 ; AVX2-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2012 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2013 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2014 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
2015 ; AVX2-NEXT: # ymm6 = mem[0,1,1,3]
2016 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,4]
2017 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2018 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
2019 ; AVX2-NEXT: # ymm7 = mem[0,1,1,3]
2020 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4]
2021 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2022 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5,6],ymm8[7]
2023 ; AVX2-NEXT: vpermq $212, (%rsp), %ymm8 # 32-byte Folded Reload
2024 ; AVX2-NEXT: # ymm8 = mem[0,1,1,3]
2025 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
2026 ; AVX2-NEXT: # ymm9 = mem[0,1,1,3]
2027 ; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[0,2,2,3,4,6,6,7]
2028 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2029 ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[0,2,2,3,4,6,6,7]
2030 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2031 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
2032 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2033 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
2034 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1]
2035 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm1[1],xmm11[2,3]
2036 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2037 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
2038 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2039 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2040 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7]
2041 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2042 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2043 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
2044 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
2045 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
2046 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
2047 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3]
2048 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2049 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
2050 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
2051 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2052 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
2053 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2054 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
2055 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,1,2,3,7,5,6,7]
2056 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7]
2057 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2058 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2059 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
2060 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2061 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
2062 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
2063 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2064 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
2065 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
2066 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2067 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2068 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2069 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2070 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2071 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2072 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7]
2073 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2074 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2075 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2076 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
2077 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2078 ; AVX2-NEXT: vmovaps %ymm1, (%rdx)
2079 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2080 ; AVX2-NEXT: vmovaps %ymm1, (%rcx)
2081 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2082 ; AVX2-NEXT: vmovaps %ymm1, (%r8)
2083 ; AVX2-NEXT: vmovdqa %ymm3, (%r9)
2084 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2085 ; AVX2-NEXT: vmovdqa %ymm4, (%rax)
2086 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2087 ; AVX2-NEXT: vmovdqa %ymm10, (%rax)
2088 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2089 ; AVX2-NEXT: vmovdqa %ymm0, (%rax)
2090 ; AVX2-NEXT: addq $264, %rsp # imm = 0x108
2091 ; AVX2-NEXT: vzeroupper
2092 ; AVX2-NEXT: retq
2093 ;
2094 ; AVX2-FP-LABEL: load_i16_stride8_vf16:
2095 ; AVX2-FP: # %bb.0:
2096 ; AVX2-FP-NEXT: subq $264, %rsp # imm = 0x108
2097 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2
2098 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2099 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm5
2100 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2101 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm0
2102 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2103 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm1
2104 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2105 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2106 ; AVX2-FP-NEXT: vpbroadcastd %xmm12, %xmm0
2107 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm1
2108 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2109 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3
2110 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2111 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2112 ; AVX2-FP-NEXT: vpbroadcastd %xmm9, %xmm1
2113 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2114 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
2115 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2116 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
2117 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2118 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4
2119 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2120 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm6
2121 ; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2122 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
2123 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2124 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
2125 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3]
2126 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm5[0,1,0,2]
2127 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,2,0,4,5,6,4]
2128 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2129 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm2[0,1,0,2]
2130 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,0,4,5,6,4]
2131 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2132 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
2133 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm0
2134 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2135 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1
2136 ; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
2137 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2]
2138 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,2,2,3,4,6,6,7]
2139 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2140 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2]
2141 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,2,2,3,4,6,6,7]
2142 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2143 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
2144 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2145 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
2146 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2147 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,1,1]
2148 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
2149 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
2150 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2151 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2152 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2153 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
2154 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2155 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2156 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
2157 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
2158 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2159 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2160 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,2,2,2]
2161 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
2162 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
2163 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2164 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,5,7]
2165 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,1,3,4,5,5,7]
2166 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2167 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2168 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
2169 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,1,2,3,7,5,6,7]
2170 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,2,3,7,5,6,7]
2171 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2172 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2173 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
2174 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
2175 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2176 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2177 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
2178 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,3,2,3]
2179 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
2180 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
2181 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
2182 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2183 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2184 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2185 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2186 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2187 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2188 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2189 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2190 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2191 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2192 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2193 ; AVX2-FP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
2194 ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %xmm1
2195 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2196 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
2197 ; AVX2-FP-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2198 ; AVX2-FP-NEXT: vpbroadcastd %xmm5, %xmm2
2199 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2200 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2201 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2202 ; AVX2-FP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
2203 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2204 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
2205 ; AVX2-FP-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2206 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2207 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2208 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
2209 ; AVX2-FP-NEXT: # ymm6 = mem[0,1,1,3]
2210 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,4]
2211 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2212 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
2213 ; AVX2-FP-NEXT: # ymm7 = mem[0,1,1,3]
2214 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4]
2215 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2216 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5,6],ymm8[7]
2217 ; AVX2-FP-NEXT: vpermq $212, (%rsp), %ymm8 # 32-byte Folded Reload
2218 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3]
2219 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
2220 ; AVX2-FP-NEXT: # ymm9 = mem[0,1,1,3]
2221 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[0,2,2,3,4,6,6,7]
2222 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2223 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[0,2,2,3,4,6,6,7]
2224 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2225 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
2226 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2227 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
2228 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1]
2229 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm1[1],xmm11[2,3]
2230 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2231 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
2232 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2233 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2234 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7]
2235 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2236 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2237 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
2238 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
2239 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
2240 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
2241 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3]
2242 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2243 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
2244 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
2245 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2246 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
2247 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2248 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
2249 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,1,2,3,7,5,6,7]
2250 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7]
2251 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2252 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2253 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
2254 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2255 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
2256 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
2257 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2258 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
2259 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
2260 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2261 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2262 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2263 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2264 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2265 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2266 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7]
2267 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2268 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2269 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2270 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
2271 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2272 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
2273 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2274 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
2275 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2276 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
2277 ; AVX2-FP-NEXT: vmovdqa %ymm3, (%r9)
2278 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2279 ; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax)
2280 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2281 ; AVX2-FP-NEXT: vmovdqa %ymm10, (%rax)
2282 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2283 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax)
2284 ; AVX2-FP-NEXT: addq $264, %rsp # imm = 0x108
2285 ; AVX2-FP-NEXT: vzeroupper
2286 ; AVX2-FP-NEXT: retq
2287 ;
2288 ; AVX2-FCP-LABEL: load_i16_stride8_vf16:
2289 ; AVX2-FCP: # %bb.0:
2290 ; AVX2-FCP-NEXT: subq $264, %rsp # imm = 0x108
2291 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
2292 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2293 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
2294 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2295 ; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
2296 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2297 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
2298 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2299 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2300 ; AVX2-FCP-NEXT: vpbroadcastd %xmm12, %xmm0
2301 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
2302 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2303 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
2304 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2305 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2306 ; AVX2-FCP-NEXT: vpbroadcastd %xmm9, %xmm1
2307 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2308 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
2309 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2310 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
2311 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2312 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
2313 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2314 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
2315 ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2316 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
2317 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2318 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
2319 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3]
2320 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm5[0,1,0,2]
2321 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,2,0,4,5,6,4]
2322 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2323 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm2[0,1,0,2]
2324 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,0,4,5,6,4]
2325 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2326 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
2327 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
2328 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2329 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
2330 ; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
2331 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2]
2332 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,2,2,3,4,6,6,7]
2333 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2334 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2]
2335 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,2,2,3,4,6,6,7]
2336 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2337 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
2338 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2339 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
2340 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2341 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,1,1]
2342 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
2343 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
2344 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2345 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2346 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2347 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
2348 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2349 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2350 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
2351 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
2352 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2353 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2354 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,2,2,2]
2355 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
2356 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
2357 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2358 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,5,7]
2359 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,1,3,4,5,5,7]
2360 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2361 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2362 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
2363 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,1,2,3,7,5,6,7]
2364 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,2,3,7,5,6,7]
2365 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2366 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2367 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
2368 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
2369 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2370 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2371 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
2372 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,3,2,3]
2373 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
2374 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
2375 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
2376 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2377 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2378 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2379 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2380 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2381 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2382 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2383 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2384 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2385 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2386 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2387 ; AVX2-FCP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
2388 ; AVX2-FCP-NEXT: vpbroadcastd %xmm0, %xmm1
2389 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2390 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
2391 ; AVX2-FCP-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2392 ; AVX2-FCP-NEXT: vpbroadcastd %xmm5, %xmm2
2393 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2394 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2395 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2396 ; AVX2-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
2397 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2398 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
2399 ; AVX2-FCP-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2400 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2401 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2402 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
2403 ; AVX2-FCP-NEXT: # ymm6 = mem[0,1,1,3]
2404 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,4]
2405 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2406 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
2407 ; AVX2-FCP-NEXT: # ymm7 = mem[0,1,1,3]
2408 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4]
2409 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2410 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5,6],ymm8[7]
2411 ; AVX2-FCP-NEXT: vpermq $212, (%rsp), %ymm8 # 32-byte Folded Reload
2412 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,1,3]
2413 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
2414 ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,1,3]
2415 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[0,2,2,3,4,6,6,7]
2416 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2417 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[0,2,2,3,4,6,6,7]
2418 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2419 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
2420 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2421 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
2422 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1]
2423 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm1[1],xmm11[2,3]
2424 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2425 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
2426 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2427 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2428 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7]
2429 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2430 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2431 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
2432 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
2433 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
2434 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
2435 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3]
2436 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2437 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
2438 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
2439 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2440 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
2441 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2442 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
2443 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,1,2,3,7,5,6,7]
2444 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7]
2445 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2446 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2447 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
2448 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2449 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
2450 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
2451 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2452 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
2453 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
2454 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2455 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2456 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2457 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2458 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2459 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2460 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7]
2461 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2462 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2463 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2464 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
2465 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2466 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
2467 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2468 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
2469 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2470 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
2471 ; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r9)
2472 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2473 ; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rax)
2474 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2475 ; AVX2-FCP-NEXT: vmovdqa %ymm10, (%rax)
2476 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2477 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
2478 ; AVX2-FCP-NEXT: addq $264, %rsp # imm = 0x108
2479 ; AVX2-FCP-NEXT: vzeroupper
2480 ; AVX2-FCP-NEXT: retq
2481 ;
2482 ; AVX512-LABEL: load_i16_stride8_vf16:
2483 ; AVX512: # %bb.0:
2484 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0
2485 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1
2486 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2487 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm24
2488 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm25
2489 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0
2490 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1
2491 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2492 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm26
2493 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27
2494 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
2495 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
2496 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
2497 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
2498 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm30
2499 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
2500 ; AVX512-NEXT: vpermt2d %xmm5, %xmm7, %xmm6
2501 ; AVX512-NEXT: vmovdqa (%rdi), %xmm9
2502 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm10
2503 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12
2504 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm13
2505 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2506 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2507 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
2508 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
2509 ; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18
2510 ; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2]
2511 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4]
2512 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20
2513 ; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2]
2514 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2515 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
2516 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2517 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
2518 ; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm23
2519 ; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2]
2520 ; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm16
2521 ; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2]
2522 ; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
2523 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2524 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
2525 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2526 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
2527 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2528 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
2529 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28
2530 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2531 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2532 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
2533 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2534 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2535 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
2536 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2537 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
2538 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
2539 ; AVX512-NEXT: vmovdqa64 %xmm29, %xmm2
2540 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
2541 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2542 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm29
2543 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
2544 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2545 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
2546 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2547 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
2548 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7]
2549 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2550 ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
2551 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2552 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
2553 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
2554 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
2555 ; AVX512-NEXT: vmovdqa64 %xmm30, %xmm6
2556 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
2557 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
2558 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19
2559 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
2560 ; AVX512-NEXT: vpermt2d %xmm4, %xmm17, %xmm14
2561 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
2562 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2563 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2564 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
2565 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2566 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2567 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2568 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2569 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2570 ; AVX512-NEXT: vmovdqa64 %xmm24, %xmm0
2571 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1
2572 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2573 ; AVX512-NEXT: vmovdqa64 %xmm26, %xmm0
2574 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm2
2575 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2576 ; AVX512-NEXT: vpermi2d %xmm1, %xmm2, %xmm7
2577 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
2578 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
2579 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2580 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3]
2581 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3]
2582 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4]
2583 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2584 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3]
2585 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4]
2586 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2587 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
2588 ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3]
2589 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
2590 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2591 ; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
2592 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7]
2593 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2594 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7]
2595 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
2596 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7]
2597 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2598 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2599 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
2600 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2601 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2602 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
2603 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
2604 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2605 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
2606 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
2607 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
2608 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7]
2609 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7]
2610 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2611 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7]
2612 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2613 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
2614 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7]
2615 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2616 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
2617 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2618 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
2619 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
2620 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2]
2621 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3]
2622 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2623 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
2624 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
2625 ; AVX512-NEXT: vpermt2d %xmm4, %xmm17, %xmm3
2626 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2627 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
2628 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2629 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2630 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
2631 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2632 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2633 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2634 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2635 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2636 ; AVX512-NEXT: vmovdqa64 %ymm28, (%rsi)
2637 ; AVX512-NEXT: vmovdqa64 %ymm29, (%rdx)
2638 ; AVX512-NEXT: vmovdqa64 %ymm19, (%rcx)
2639 ; AVX512-NEXT: vmovdqa %ymm8, (%r8)
2640 ; AVX512-NEXT: vmovdqa %ymm10, (%r9)
2641 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
2642 ; AVX512-NEXT: vmovdqa %ymm7, (%rax)
2643 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
2644 ; AVX512-NEXT: vmovdqa %ymm5, (%rax)
2645 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
2646 ; AVX512-NEXT: vmovdqa %ymm0, (%rax)
2647 ; AVX512-NEXT: vzeroupper
2648 ; AVX512-NEXT: retq
2649 ;
2650 ; AVX512-FCP-LABEL: load_i16_stride8_vf16:
2651 ; AVX512-FCP: # %bb.0:
2652 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
2653 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
2654 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2655 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
2656 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm27
2657 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
2658 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
2659 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2660 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
2661 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm29
2662 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
2663 ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm13
2664 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm13
2665 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
2666 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
2667 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm9
2668 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm10
2669 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2670 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2671 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm30
2672 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm31
2673 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
2674 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3]
2675 ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
2676 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
2677 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4]
2678 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2679 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
2680 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2]
2681 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4]
2682 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2683 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
2684 ; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm19
2685 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2]
2686 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
2687 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2]
2688 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
2689 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2690 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
2691 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2692 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
2693 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
2694 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
2695 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm24
2696 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
2697 ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm1
2698 ; AVX512-FCP-NEXT: vpermt2d %xmm5, %xmm15, %xmm1
2699 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
2700 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
2701 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2702 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2703 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
2704 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2705 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2706 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
2707 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
2708 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2709 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25
2710 ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm0
2711 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
2712 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6]
2713 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm12, %xmm0
2714 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
2715 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2716 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7]
2717 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2718 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
2719 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2720 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
2721 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7]
2722 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2723 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7]
2724 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2725 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
2726 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
2727 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2728 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
2729 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0]
2730 ; AVX512-FCP-NEXT: vpermt2d %xmm5, %xmm18, %xmm11
2731 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
2732 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2733 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2734 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
2735 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2736 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2737 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
2738 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2739 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2740 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
2741 ; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm0
2742 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
2743 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2744 ; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
2745 ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm1
2746 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2747 ; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm7
2748 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
2749 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
2750 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
2751 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2752 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
2753 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3]
2754 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3]
2755 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4]
2756 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2757 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3]
2758 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4]
2759 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2760 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7]
2761 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3]
2762 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
2763 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2764 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3]
2765 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7]
2766 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2767 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
2768 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
2769 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2770 ; AVX512-FCP-NEXT: vpermi2d %xmm6, %xmm5, %xmm15
2771 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2772 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
2773 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2774 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2775 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
2776 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2777 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2778 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
2779 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2780 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2781 ; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm12
2782 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
2783 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
2784 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7]
2785 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2786 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7]
2787 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2788 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
2789 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7]
2790 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2791 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
2792 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2793 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
2794 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
2795 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
2796 ; AVX512-FCP-NEXT: vpermt2d %xmm6, %xmm18, %xmm5
2797 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2798 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
2799 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2800 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2801 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
2802 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2803 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2804 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
2805 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
2806 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
2807 ; AVX512-FCP-NEXT: vmovdqa64 %ymm24, (%rsi)
2808 ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, (%rdx)
2809 ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%rcx)
2810 ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, (%r8)
2811 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9)
2812 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2813 ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rax)
2814 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2815 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
2816 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2817 ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
2818 ; AVX512-FCP-NEXT: vzeroupper
2819 ; AVX512-FCP-NEXT: retq
2821 ; AVX512DQ-LABEL: load_i16_stride8_vf16:
2822 ; AVX512DQ: # %bb.0:
2823 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm0
2824 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm1
2825 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2826 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm24
2827 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm25
2828 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0
2829 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1
2830 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2831 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm26
2832 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27
2833 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
2834 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
2835 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
2836 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
2837 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm30
2838 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
2839 ; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm7, %xmm6
2840 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9
2841 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm10
2842 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm12
2843 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm13
2844 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2845 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2846 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
2847 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
2848 ; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18
2849 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2]
2850 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4]
2851 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20
2852 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2]
2853 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2854 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
2855 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2856 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
2857 ; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm23
2858 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2]
2859 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm16
2860 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2]
2861 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
2862 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2863 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
2864 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2865 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
2866 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2867 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
2868 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm28
2869 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2870 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2871 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
2872 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2873 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2874 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
2875 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2876 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
2877 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
2878 ; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm2
2879 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
2880 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2881 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm29
2882 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
2883 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2884 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
2885 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2886 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
2887 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7]
2888 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2889 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
2890 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2891 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
2892 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
2893 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
2894 ; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm6
2895 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
2896 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
2897 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19
2898 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
2899 ; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm17, %xmm14
2900 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
2901 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2902 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2903 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
2904 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2905 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2906 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2907 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2908 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2909 ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0
2910 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm1
2911 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2912 ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm0
2913 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm2
2914 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2915 ; AVX512DQ-NEXT: vpermi2d %xmm1, %xmm2, %xmm7
2916 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
2917 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
2918 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2919 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3]
2920 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3]
2921 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4]
2922 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2923 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3]
2924 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4]
2925 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2926 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
2927 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3]
2928 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
2929 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2930 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
2931 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7]
2932 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2933 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7]
2934 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
2935 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7]
2936 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2937 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2938 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
2939 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2940 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2941 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
2942 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
2943 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2944 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
2945 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
2946 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
2947 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7]
2948 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7]
2949 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2950 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7]
2951 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2952 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
2953 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7]
2954 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2955 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
2956 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2957 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
2958 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
2959 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2]
2960 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3]
2961 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2962 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
2963 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
2964 ; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm17, %xmm3
2965 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2966 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
2967 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2968 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2969 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
2970 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2971 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2972 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2973 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2974 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2975 ; AVX512DQ-NEXT: vmovdqa64 %ymm28, (%rsi)
2976 ; AVX512DQ-NEXT: vmovdqa64 %ymm29, (%rdx)
2977 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, (%rcx)
2978 ; AVX512DQ-NEXT: vmovdqa %ymm8, (%r8)
2979 ; AVX512DQ-NEXT: vmovdqa %ymm10, (%r9)
2980 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
2981 ; AVX512DQ-NEXT: vmovdqa %ymm7, (%rax)
2982 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
2983 ; AVX512DQ-NEXT: vmovdqa %ymm5, (%rax)
2984 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
2985 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
2986 ; AVX512DQ-NEXT: vzeroupper
2987 ; AVX512DQ-NEXT: retq
2989 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf16:
2990 ; AVX512DQ-FCP: # %bb.0:
2991 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
2992 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
2993 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2994 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
2995 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm27
2996 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
2997 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
2998 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2999 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
3000 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm29
3001 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
3002 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm13
3003 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm13
3004 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
3005 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
3006 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm9
3007 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm10
3008 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
3009 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3010 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm30
3011 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm31
3012 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
3013 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3]
3014 ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
3015 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
3016 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4]
3017 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
3018 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
3019 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2]
3020 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4]
3021 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
3022 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
3023 ; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm19
3024 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2]
3025 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
3026 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2]
3027 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
3028 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
3029 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
3030 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
3031 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
3032 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
3033 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
3034 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm24
3035 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
3036 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm1
3037 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm5, %xmm15, %xmm1
3038 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
3039 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
3040 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
3041 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
3042 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
3043 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3044 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
3045 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
3046 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
3047 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3048 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm25
3049 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm0
3050 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
3051 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6]
3052 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm12, %xmm0
3053 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
3054 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3055 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7]
3056 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
3057 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
3058 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
3059 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
3060 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7]
3061 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
3062 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7]
3063 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
3064 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
3065 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
3066 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3067 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
3068 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0]
3069 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm5, %xmm18, %xmm11
3070 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
3071 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
3072 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
3073 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
3074 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
3075 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
3076 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
3077 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
3078 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3079 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
3080 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm0
3081 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
3082 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3083 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
3084 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm1
3085 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3086 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm7
3087 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
3088 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
3089 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
3090 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3091 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
3092 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3]
3093 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3]
3094 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4]
3095 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
3096 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3]
3097 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4]
3098 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
3099 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7]
3100 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3]
3101 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
3102 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
3103 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3]
3104 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7]
3105 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
3106 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
3107 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
3108 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3109 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm6, %xmm5, %xmm15
3110 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
3111 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
3112 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
3113 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
3114 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
3115 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3116 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
3117 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
3118 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
3119 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3120 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm12
3121 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
3122 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
3123 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7]
3124 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
3125 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7]
3126 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
3127 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
3128 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7]
3129 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
3130 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
3131 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
3132 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
3133 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
3134 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
3135 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm6, %xmm18, %xmm5
3136 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
3137 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
3138 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
3139 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
3140 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
3141 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
3142 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
3143 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
3144 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
3145 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
3146 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, (%rsi)
3147 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, (%rdx)
3148 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%rcx)
3149 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, (%r8)
3150 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9)
3151 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3152 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rax)
3153 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3154 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
3155 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3156 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax)
3157 ; AVX512DQ-FCP-NEXT: vzeroupper
3158 ; AVX512DQ-FCP-NEXT: retq
3160 ; AVX512BW-LABEL: load_i16_stride8_vf16:
3161 ; AVX512BW: # %bb.0:
3162 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3163 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3164 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
3165 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3166 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3167 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3168 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3169 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
3170 ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
3171 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3172 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56]
3173 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3174 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3175 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
3176 ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
3177 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3178 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57]
3179 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3180 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3181 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
3182 ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
3183 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3184 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58]
3185 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3186 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3187 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
3188 ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
3189 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3190 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59]
3191 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3192 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3193 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
3194 ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
3195 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3196 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60]
3197 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
3198 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3199 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
3200 ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
3201 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3202 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61]
3203 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
3204 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3205 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
3206 ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1]
3207 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3208 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62]
3209 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11
3210 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
3211 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
3212 ; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1]
3213 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11
3214 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63]
3215 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3216 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
3217 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi)
3218 ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx)
3219 ; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx)
3220 ; AVX512BW-NEXT: vmovdqa %ymm7, (%r8)
3221 ; AVX512BW-NEXT: vmovdqa %ymm8, (%r9)
3222 ; AVX512BW-NEXT: vmovdqa %ymm9, (%r11)
3223 ; AVX512BW-NEXT: vmovdqa %ymm10, (%r10)
3224 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
3225 ; AVX512BW-NEXT: vzeroupper
3226 ; AVX512BW-NEXT: retq
3228 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf16:
3229 ; AVX512BW-FCP: # %bb.0:
3230 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3231 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3232 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
3233 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3234 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3235 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3236 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3237 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
3238 ; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1]
3239 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3240 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56]
3241 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3242 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3243 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
3244 ; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
3245 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3246 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57]
3247 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3248 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3249 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
3250 ; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
3251 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3252 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58]
3253 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3254 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3255 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
3256 ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
3257 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3258 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59]
3259 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3260 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3261 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
3262 ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
3263 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3264 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60]
3265 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
3266 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3267 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
3268 ; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1]
3269 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3270 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61]
3271 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
3272 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3273 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
3274 ; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1]
3275 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3276 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62]
3277 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11
3278 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
3279 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
3280 ; AVX512BW-FCP-NEXT: # ymm11 = mem[0,1,0,1]
3281 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11
3282 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63]
3283 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3284 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
3285 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
3286 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
3287 ; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
3288 ; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
3289 ; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
3290 ; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r11)
3291 ; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%r10)
3292 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
3293 ; AVX512BW-FCP-NEXT: vzeroupper
3294 ; AVX512BW-FCP-NEXT: retq
3296 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf16:
3297 ; AVX512DQ-BW: # %bb.0:
3298 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3299 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3300 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
3301 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
3302 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3303 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3304 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3305 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
3306 ; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1]
3307 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3308 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56]
3309 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3310 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3311 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
3312 ; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1]
3313 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3314 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57]
3315 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3316 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3317 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
3318 ; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1]
3319 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3320 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58]
3321 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3322 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3323 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
3324 ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1]
3325 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3326 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59]
3327 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3328 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3329 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
3330 ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1]
3331 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3332 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60]
3333 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
3334 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3335 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
3336 ; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1]
3337 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3338 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61]
3339 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
3340 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3341 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
3342 ; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1]
3343 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3344 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62]
3345 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11
3346 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
3347 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
3348 ; AVX512DQ-BW-NEXT: # ymm11 = mem[0,1,0,1]
3349 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11
3350 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63]
3351 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3352 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
3353 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi)
3354 ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx)
3355 ; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rcx)
3356 ; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8)
3357 ; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9)
3358 ; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%r11)
3359 ; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%r10)
3360 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
3361 ; AVX512DQ-BW-NEXT: vzeroupper
3362 ; AVX512DQ-BW-NEXT: retq
3364 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf16:
3365 ; AVX512DQ-BW-FCP: # %bb.0:
3366 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3367 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3368 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
3369 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3370 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3371 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3372 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3373 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
3374 ; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1]
3375 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3376 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56]
3377 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3378 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3379 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
3380 ; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
3381 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3382 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57]
3383 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3384 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3385 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
3386 ; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
3387 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3388 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58]
3389 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3390 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3391 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
3392 ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
3393 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3394 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59]
3395 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3396 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3397 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
3398 ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
3399 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3400 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60]
3401 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
3402 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3403 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
3404 ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1]
3405 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3406 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61]
3407 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
3408 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3409 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
3410 ; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1]
3411 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3412 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62]
3413 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11
3414 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
3415 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
3416 ; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[0,1,0,1]
3417 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11
3418 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63]
3419 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3420 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
3421 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
3422 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
3423 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
3424 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
3425 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
3426 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r11)
3427 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%r10)
3428 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
3429 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
3430 ; AVX512DQ-BW-FCP-NEXT: retq
3431 %wide.vec = load <128 x i16>, ptr %in.vec, align 64
3432 %strided.vec0 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
3433 %strided.vec1 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121>
3434 %strided.vec2 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122>
3435 %strided.vec3 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123>
3436 %strided.vec4 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124>
3437 %strided.vec5 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
3438 %strided.vec6 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
3439 %strided.vec7 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
3440 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
3441 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
3442 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
3443 store <16 x i16> %strided.vec3, ptr %out.vec3, align 64
3444 store <16 x i16> %strided.vec4, ptr %out.vec4, align 64
3445 store <16 x i16> %strided.vec5, ptr %out.vec5, align 64
3446 store <16 x i16> %strided.vec6, ptr %out.vec6, align 64
3447 store <16 x i16> %strided.vec7, ptr %out.vec7, align 64
3448 ret void
3449 }
3451 define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
3452 ; SSE-LABEL: load_i16_stride8_vf32:
3453 ; SSE:       # %bb.0:
3454 ; SSE-NEXT: subq $696, %rsp # imm = 0x2B8
3455 ; SSE-NEXT: movdqa 496(%rdi), %xmm2
3456 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3457 ; SSE-NEXT: movdqa 480(%rdi), %xmm14
3458 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3459 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
3460 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3461 ; SSE-NEXT: movdqa 192(%rdi), %xmm3
3462 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3463 ; SSE-NEXT: movdqa 240(%rdi), %xmm5
3464 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3465 ; SSE-NEXT: movdqa 224(%rdi), %xmm15
3466 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3467 ; SSE-NEXT: movdqa 144(%rdi), %xmm6
3468 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3469 ; SSE-NEXT: movdqa 128(%rdi), %xmm4
3470 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3471 ; SSE-NEXT: movdqa 176(%rdi), %xmm7
3472 ; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
3473 ; SSE-NEXT: movdqa 160(%rdi), %xmm0
3474 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3475 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
3476 ; SSE-NEXT: movdqa %xmm0, %xmm7
3477 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3478 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
3479 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3480 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
3481 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3482 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0]
3483 ; SSE-NEXT: movdqa %xmm3, %xmm15
3484 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
3485 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0]
3486 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3487 ; SSE-NEXT: movdqa %xmm4, %xmm0
3488 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
3489 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3490 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3491 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3]
3492 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3493 ; SSE-NEXT: movdqa 464(%rdi), %xmm0
3494 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3495 ; SSE-NEXT: movdqa 448(%rdi), %xmm8
3496 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3497 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
3498 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
3499 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0]
3500 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3501 ; SSE-NEXT: movdqa 432(%rdi), %xmm0
3502 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3503 ; SSE-NEXT: movdqa 416(%rdi), %xmm11
3504 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3505 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
3506 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3507 ; SSE-NEXT: movdqa 400(%rdi), %xmm0
3508 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3509 ; SSE-NEXT: movdqa 384(%rdi), %xmm10
3510 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3511 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
3512 ; SSE-NEXT: movdqa %xmm10, %xmm0
3513 ; SSE-NEXT: movdqa %xmm10, %xmm14
3514 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3515 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
3516 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3517 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3518 ; SSE-NEXT: movdqa 368(%rdi), %xmm0
3519 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3520 ; SSE-NEXT: movdqa 352(%rdi), %xmm9
3521 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3522 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
3523 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3524 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
3525 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3526 ; SSE-NEXT: movdqa 320(%rdi), %xmm12
3527 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3528 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
3529 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0]
3530 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
3531 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3532 ; SSE-NEXT: movdqa 304(%rdi), %xmm0
3533 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3534 ; SSE-NEXT: movdqa 288(%rdi), %xmm11
3535 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3536 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
3537 ; SSE-NEXT: movdqa 272(%rdi), %xmm0
3538 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3539 ; SSE-NEXT: movdqa 256(%rdi), %xmm10
3540 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3541 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
3542 ; SSE-NEXT: movdqa %xmm10, %xmm0
3543 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
3544 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3545 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3546 ; SSE-NEXT: movdqa 112(%rdi), %xmm0
3547 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3548 ; SSE-NEXT: movdqa 96(%rdi), %xmm9
3549 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3550 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
3551 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
3552 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3553 ; SSE-NEXT: movdqa 64(%rdi), %xmm13
3554 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3555 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
3556 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0]
3557 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0]
3558 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3559 ; SSE-NEXT: movdqa 32(%rdi), %xmm7
3560 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3561 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
3562 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3563 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
3564 ; SSE-NEXT: movdqa (%rdi), %xmm6
3565 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3566 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
3567 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3568 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
3569 ; SSE-NEXT: movdqa %xmm6, %xmm1
3570 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
3571 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
3572 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3573 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3574 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
3575 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3576 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
3577 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3578 ; SSE-NEXT: movdqa %xmm15, %xmm0
3579 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3580 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3581 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3582 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3583 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
3584 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3585 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
3586 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3587 ; SSE-NEXT: movdqa %xmm8, %xmm0
3588 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3589 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3590 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3591 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3592 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
3593 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
3594 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3595 ; SSE-NEXT: movdqa %xmm12, %xmm0
3596 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3597 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
3598 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3599 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3600 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
3601 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
3602 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3603 ; SSE-NEXT: movdqa %xmm13, %xmm0
3604 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
3605 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3606 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3607 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
3608 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2]
3609 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3610 ; SSE-NEXT: movdqa %xmm4, %xmm0
3611 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
3612 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3613 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3614 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
3615 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2]
3616 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3617 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3618 ; SSE-NEXT: movapd %xmm3, %xmm0
3619 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3620 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3621 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3622 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3623 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
3624 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2]
3625 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3626 ; SSE-NEXT: movdqa %xmm10, %xmm0
3627 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
3628 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3629 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3630 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2]
3631 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
3632 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3633 ; SSE-NEXT: movdqa %xmm6, %xmm1
3634 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
3635 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3636 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3637 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3638 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3639 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
3640 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
3641 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3642 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
3643 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3644 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3645 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
3646 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3]
3647 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
3648 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3649 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
3650 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3651 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3]
3652 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
3653 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3]
3654 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3655 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
3656 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3657 ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3]
3658 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3]
3659 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
3660 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3661 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
3662 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3663 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3664 ; SSE-NEXT: punpckhwd (%rsp), %xmm3 # 16-byte Folded Reload
3665 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
3666 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3667 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3668 ; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
3669 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3670 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3671 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3672 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3673 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3674 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
3675 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
3676 ; SSE-NEXT: movdqa %xmm1, %xmm12
3677 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
3678 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3679 ; SSE-NEXT: movdqa %xmm15, %xmm0
3680 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3681 ; SSE-NEXT: movdqa %xmm3, %xmm7
3682 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3683 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3684 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3685 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3686 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3687 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3688 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3689 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3690 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
3691 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3692 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3693 ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
3694 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3695 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3696 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
3697 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3698 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0]
3699 ; SSE-NEXT: movdqa %xmm8, %xmm11
3700 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0]
3701 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
3702 ; SSE-NEXT: movdqa %xmm4, %xmm0
3703 ; SSE-NEXT: movdqa %xmm4, %xmm14
3704 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3705 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3706 ; SSE-NEXT: movdqa %xmm1, %xmm13
3707 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3708 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
3709 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3710 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3711 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
3712 ; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
3713 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3714 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
3715 ; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7]
3716 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3717 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3718 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
3719 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3720 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3721 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3722 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
3723 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3724 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3725 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0]
3726 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
3727 ; SSE-NEXT: movdqa %xmm9, %xmm0
3728 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
3729 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
3730 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3731 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3732 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3733 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
3734 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3735 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3736 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3737 ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
3738 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3739 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0]
3740 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3741 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3742 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3743 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3744 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
3745 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3746 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3747 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3748 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
3749 ; SSE-NEXT: movdqa %xmm4, %xmm3
3750 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
3751 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
3752 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3753 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1]
3754 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
3755 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3756 ; SSE-NEXT: movdqa %xmm2, %xmm0
3757 ; SSE-NEXT: movdqa %xmm2, %xmm5
3758 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3759 ; SSE-NEXT: movdqa %xmm12, %xmm3
3760 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3761 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
3762 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3763 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3764 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1]
3765 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
3766 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3767 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3768 ; SSE-NEXT: movdqa %xmm1, %xmm0
3769 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
3770 ; SSE-NEXT: movdqa %xmm11, %xmm6
3771 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3772 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
3773 ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
3774 ; SSE-NEXT: movdqa %xmm9, %xmm14
3775 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3776 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1]
3777 ; SSE-NEXT: movdqa %xmm10, %xmm13
3778 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3779 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
3780 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
3781 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3782 ; SSE-NEXT: movaps %xmm7, %xmm0
3783 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3784 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
3785 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3]
3786 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
3787 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3788 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
3789 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
3790 ; SSE-NEXT: movdqa %xmm8, %xmm0
3791 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3792 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
3793 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3]
3794 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
3795 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,2,2]
3796 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3]
3797 ; SSE-NEXT: movdqa %xmm15, %xmm0
3798 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3799 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
3800 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1]
3801 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2]
3802 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2]
3803 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3]
3804 ; SSE-NEXT: movdqa %xmm14, %xmm0
3805 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3]
3806 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
3807 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
3808 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,2,2]
3809 ; SSE-NEXT: movdqa %xmm1, %xmm3
3810 ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
3811 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3812 ; SSE-NEXT: movapd %xmm6, %xmm0
3813 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3814 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
3815 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
3816 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2]
3817 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3818 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2]
3819 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3820 ; SSE-NEXT: movdqa %xmm4, %xmm14
3821 ; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3]
3822 ; SSE-NEXT: movdqa %xmm2, %xmm8
3823 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3]
3824 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3825 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3826 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
3827 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[3,3,3,3]
3828 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3829 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
3830 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3831 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,3]
3832 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3833 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3834 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
3835 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3836 ; SSE-NEXT: # xmm2 = mem[3,3,3,3]
3837 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3838 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
3839 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3840 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
3841 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3842 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
3843 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3]
3844 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
3845 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3846 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
3847 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3848 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
3849 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
3850 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3]
3851 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
3852 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
3853 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3854 ; SSE-NEXT: movaps %xmm3, 32(%rsi)
3855 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3856 ; SSE-NEXT: movaps %xmm3, 48(%rsi)
3857 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3858 ; SSE-NEXT: movaps %xmm3, (%rsi)
3859 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3860 ; SSE-NEXT: movaps %xmm8, 16(%rsi)
3861 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3862 ; SSE-NEXT: movaps %xmm3, 32(%rdx)
3863 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3864 ; SSE-NEXT: movaps %xmm3, 48(%rdx)
3865 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3866 ; SSE-NEXT: movaps %xmm3, (%rdx)
3867 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3868 ; SSE-NEXT: movaps %xmm3, 16(%rdx)
3869 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3870 ; SSE-NEXT: movaps %xmm3, 32(%rcx)
3871 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3872 ; SSE-NEXT: movaps %xmm3, 48(%rcx)
3873 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3874 ; SSE-NEXT: movaps %xmm3, (%rcx)
3875 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3876 ; SSE-NEXT: movaps %xmm3, 16(%rcx)
3877 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3878 ; SSE-NEXT: movaps %xmm3, 32(%r8)
3879 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3880 ; SSE-NEXT: movaps %xmm3, 48(%r8)
3881 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3882 ; SSE-NEXT: movaps %xmm3, (%r8)
3883 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3884 ; SSE-NEXT: movaps %xmm3, 16(%r8)
3885 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3886 ; SSE-NEXT: movaps %xmm3, 32(%r9)
3887 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3888 ; SSE-NEXT: movaps %xmm3, 48(%r9)
3889 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3890 ; SSE-NEXT: movaps %xmm3, (%r9)
3891 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3892 ; SSE-NEXT: movaps %xmm3, 16(%r9)
3893 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3894 ; SSE-NEXT: movaps %xmm9, 32(%rax)
3895 ; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload
3896 ; SSE-NEXT: movaps %xmm3, 48(%rax)
3897 ; SSE-NEXT: movaps %xmm10, (%rax)
3898 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3899 ; SSE-NEXT: movaps %xmm3, 16(%rax)
3900 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3901 ; SSE-NEXT: movapd %xmm13, 48(%rax)
3902 ; SSE-NEXT: movapd %xmm12, 32(%rax)
3903 ; SSE-NEXT: movapd %xmm11, 16(%rax)
3904 ; SSE-NEXT: movaps %xmm14, (%rax)
3905 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3906 ; SSE-NEXT: movaps %xmm1, 48(%rax)
3907 ; SSE-NEXT: movaps %xmm2, 32(%rax)
3908 ; SSE-NEXT: movaps %xmm15, 16(%rax)
3909 ; SSE-NEXT: movaps %xmm0, (%rax)
3910 ; SSE-NEXT: addq $696, %rsp # imm = 0x2B8
3911 ; SSE-NEXT: retq
3912 ;
3913 ; AVX-LABEL: load_i16_stride8_vf32:
3914 ; AVX: # %bb.0:
3915 ; AVX-NEXT: subq $872, %rsp # imm = 0x368
3916 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm0
3917 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3918 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm1
3919 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3920 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3921 ; AVX-NEXT: vmovdqa 272(%rdi), %xmm0
3922 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3923 ; AVX-NEXT: vmovdqa 256(%rdi), %xmm1
3924 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3925 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3926 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
3927 ; AVX-NEXT: vmovdqa 368(%rdi), %xmm1
3928 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3929 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm2
3930 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3931 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3932 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
3933 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm2
3934 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3935 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm3
3936 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3937 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3938 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,0,1]
3939 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
3940 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
3941 ; AVX-NEXT: vmovdqa 496(%rdi), %xmm1
3942 ; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
3943 ; AVX-NEXT: vmovdqa 480(%rdi), %xmm2
3944 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3945 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3946 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,0,0]
3947 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3948 ; AVX-NEXT: vmovdqa 464(%rdi), %xmm2
3949 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3950 ; AVX-NEXT: vmovdqa 448(%rdi), %xmm3
3951 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3952 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3953 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,1]
3954 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
3955 ; AVX-NEXT: vmovdqa 432(%rdi), %xmm2
3956 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3957 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm3
3958 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3959 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3960 ; AVX-NEXT: vmovdqa 400(%rdi), %xmm2
3961 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3962 ; AVX-NEXT: vmovdqa 384(%rdi), %xmm3
3963 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3964 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3965 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1]
3966 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
3967 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3968 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
3969 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3970 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3971 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
3972 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3973 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm1
3974 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3975 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3976 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3977 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3978 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm1
3979 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3980 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm2
3981 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3982 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3983 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3984 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
3985 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
3986 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm1
3987 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3988 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm2
3989 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3990 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3991 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3992 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm1
3993 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3994 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm2
3995 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3996 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3997 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3998 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
3999 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4000 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4001 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4002 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm0
4003 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4004 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm1
4005 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4006 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4007 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4008 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4009 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm1
4010 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4011 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
4012 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4013 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4014 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4015 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4016 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
4017 ; AVX-NEXT: vmovdqa (%rdi), %xmm15
4018 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4019 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
4020 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4021 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
4022 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4023 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm4
4024 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4025 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
4026 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
4027 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
4028 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4029 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7]
4030 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4031 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4032 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4033 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
4034 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7]
4035 ; AVX-NEXT: vmovdqa %xmm6, %xmm4
4036 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4037 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
4038 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
4039 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
4040 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4041 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4042 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1]
4043 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7]
4044 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
4045 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
4046 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4047 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4048 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4049 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4050 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
4051 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4052 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4053 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1,1,1]
4054 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4055 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
4056 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4057 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
4058 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
4059 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3],xmm3[4,5,6,7]
4060 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4061 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4062 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
4063 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7]
4064 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4065 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4066 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
4067 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5],xmm0[6,7]
4068 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
4069 ; AVX-NEXT: vmovdqa %xmm5, %xmm4
4070 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
4071 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
4072 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4073 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4074 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,2,2,2]
4075 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,5],xmm15[6,7]
4076 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
4077 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
4078 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4079 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4080 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4081 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4082 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2]
4083 ; AVX-NEXT: vmovaps %xmm6, %xmm5
4084 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
4085 ; AVX-NEXT: vmovaps %xmm7, %xmm6
4086 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4087 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
4088 ; AVX-NEXT: vmovdqa %xmm9, %xmm7
4089 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,2,2]
4090 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4091 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5],xmm3[6,7]
4092 ; AVX-NEXT: vmovdqa %xmm12, %xmm9
4093 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4094 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
4095 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7]
4096 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4097 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4098 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload
4099 ; AVX-NEXT: # xmm0 = xmm14[2],mem[2],xmm14[3],mem[3]
4100 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
4101 ; AVX-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4102 ; AVX-NEXT: # xmm14 = mem[3,3,3,3]
4103 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7]
4104 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
4105 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm11[2],xmm10[3],xmm11[3]
4106 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3]
4107 ; AVX-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4108 ; AVX-NEXT: # xmm10 = mem[3,3,3,3]
4109 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
4110 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4111 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
4112 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
4113 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4114 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4115 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
4116 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3]
4117 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm2[3,3,3,3]
4118 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
4119 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4120 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4121 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
4122 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
4123 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[2,3,2,3]
4124 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3]
4125 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
4126 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
4127 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4128 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4129 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4130 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
4131 ; AVX-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4132 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4133 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
4134 ; AVX-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4135 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4136 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
4137 ; AVX-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4138 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4139 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
4140 ; AVX-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4141 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0]
4142 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
4143 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4144 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
4145 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
4146 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4147 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4148 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
4149 ; AVX-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4150 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4151 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4152 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
4153 ; AVX-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4154 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4155 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4156 ; AVX-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload
4157 ; AVX-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4158 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4159 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4160 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
4161 ; AVX-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4162 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4163 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
4164 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1]
4165 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
4166 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4167 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
4168 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4169 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
4170 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4171 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4172 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4173 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
4174 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4175 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4176 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4177 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
4178 ; AVX-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4179 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4180 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
4181 ; AVX-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4182 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4183 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
4184 ; AVX-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4185 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[0,0,0,0]
4186 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4187 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,0,1]
4188 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4189 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
4190 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm2[0],xmm11[1],xmm2[1]
4191 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4192 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4193 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4194 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4195 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
4196 ; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4197 ; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
4198 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4199 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4200 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4201 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4202 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
4203 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
4204 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7]
4205 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4206 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload
4207 ; AVX-NEXT: # xmm13 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4208 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4209 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
4210 ; AVX-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4211 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
4212 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7]
4213 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4214 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4215 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
4216 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4217 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5,6,7]
4218 ; AVX-NEXT: vmovdqa %xmm8, %xmm5
4219 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4220 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4221 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
4222 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4223 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4224 ; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4225 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4226 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4227 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1]
4228 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4229 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3]
4230 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
4231 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
4232 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4233 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4234 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
4235 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4236 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1]
4237 ; AVX-NEXT: vmovdqa %xmm11, %xmm9
4238 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4239 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,3],xmm3[4,5,6,7]
4240 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4241 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
4242 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[1,1,1,1]
4243 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3],xmm3[4,5,6,7]
4244 ; AVX-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
4245 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4246 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
4247 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7]
4248 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4249 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4250 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
4251 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4252 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5],xmm0[6,7]
4253 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm6[2],xmm10[3],xmm6[3]
4254 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
4255 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4256 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4257 ; AVX-NEXT: vmovaps %xmm8, %xmm6
4258 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm8[2,2,2,2]
4259 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm7[0,1,2],xmm15[3]
4260 ; AVX-NEXT: vmovaps %xmm7, %xmm8
4261 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
4262 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
4263 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4264 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4265 ; AVX-NEXT: vmovdqa %xmm9, %xmm4
4266 ; AVX-NEXT: vmovdqa %xmm11, %xmm1
4267 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
4268 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4269 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4270 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2]
4271 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4272 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3]
4273 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4274 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
4275 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2]
4276 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm3[6,7]
4277 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
4278 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7]
4279 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4280 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload
4281 ; AVX-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3]
4282 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4283 ; AVX-NEXT: # xmm3 = mem[2,3,2,3]
4284 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3]
4285 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3],xmm7[4,5,6,7]
4286 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
4287 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm6[2],xmm8[3],xmm6[3]
4288 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4289 ; AVX-NEXT: # xmm5 = mem[2,3,2,3]
4290 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4291 ; AVX-NEXT: # xmm6 = mem[3,3,3,3]
4292 ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
4293 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4294 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
4295 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
4296 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4297 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
4298 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
4299 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,3,3]
4300 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
4301 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4302 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
4303 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
4304 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm2[2],xmm12[3],xmm2[3]
4305 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,3,2,3]
4306 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,3,3,3]
4307 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
4308 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
4309 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
4310 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4311 ; AVX-NEXT: vmovaps %ymm2, (%rsi)
4312 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4313 ; AVX-NEXT: vmovaps %ymm2, 32(%rsi)
4314 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4315 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
4316 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4317 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
4318 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4319 ; AVX-NEXT: vmovaps %ymm2, (%rcx)
4320 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4321 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
4322 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4323 ; AVX-NEXT: vmovaps %ymm2, (%r8)
4324 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4325 ; AVX-NEXT: vmovaps %ymm2, 32(%r8)
4326 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4327 ; AVX-NEXT: vmovaps %ymm2, (%r9)
4328 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4329 ; AVX-NEXT: vmovaps %ymm2, 32(%r9)
4330 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4331 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4332 ; AVX-NEXT: vmovaps %ymm2, (%rax)
4333 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4334 ; AVX-NEXT: vmovaps %ymm2, 32(%rax)
4335 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4336 ; AVX-NEXT: vmovaps %ymm15, (%rax)
4337 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4338 ; AVX-NEXT: vmovaps %ymm2, 32(%rax)
4339 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4340 ; AVX-NEXT: vmovaps %ymm1, (%rax)
4341 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
4342 ; AVX-NEXT: addq $872, %rsp # imm = 0x368
4343 ; AVX-NEXT: vzeroupper
4344 ; AVX-NEXT: retq
4345 ;
4346 ; AVX2-LABEL: load_i16_stride8_vf32:
4347 ; AVX2: # %bb.0:
4348 ; AVX2-NEXT: subq $1000, %rsp # imm = 0x3E8
4349 ; AVX2-NEXT: vmovdqa 448(%rdi), %ymm2
4350 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4351 ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm3
4352 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4353 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm0
4354 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4355 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm1
4356 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4357 ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm4
4358 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4359 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm5
4360 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4361 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4362 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4363 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4364 ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm0
4365 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4366 ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm1
4367 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4368 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4369 ; AVX2-NEXT: vpbroadcastd %xmm9, %xmm0
4370 ; AVX2-NEXT: vmovdqa 336(%rdi), %xmm1
4371 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4372 ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm4
4373 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4374 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4375 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4376 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
4377 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4378 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
4379 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4380 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
4381 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4382 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
4383 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4384 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4385 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4386 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4387 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
4388 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4389 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4390 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4391 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm2
4392 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4393 ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm3
4394 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4395 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
4396 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4397 ; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,2,2,3,4,6,6,7]
4398 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4399 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4400 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4401 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
4402 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4403 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
4404 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
4405 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4406 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4407 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm0
4408 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4409 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm1
4410 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4411 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4412 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4413 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
4414 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm1
4415 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4416 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
4417 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4418 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4419 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4420 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
4421 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4422 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
4423 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4424 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
4425 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4426 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
4427 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4428 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm4
4429 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4430 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
4431 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4432 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm15[0],xmm10[1],xmm15[1]
4433 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
4434 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0
4435 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4436 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm1
4437 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4438 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
4439 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4440 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
4441 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4442 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
4443 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4444 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
4445 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4446 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm11[7]
4447 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2
4448 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4449 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0
4450 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4451 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
4452 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4453 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
4454 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4455 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4456 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4457 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4458 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4459 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
4460 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4461 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
4462 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4463 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4464 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
4465 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4466 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
4467 ; AVX2-NEXT: vmovdqa %xmm9, %xmm14
4468 ; AVX2-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
4469 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4470 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
4471 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
4472 ; AVX2-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4473 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4474 ; AVX2-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
4475 ; AVX2-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4476 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
4477 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4478 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4479 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
4480 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
4481 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4482 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4483 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
4484 ; AVX2-NEXT: vmovdqa %xmm10, %xmm11
4485 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3]
4486 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4487 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4488 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
4489 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
4490 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4491 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4492 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
4493 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4494 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4495 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
4496 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
4497 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4498 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4499 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
4500 ; AVX2-NEXT: vmovdqa %xmm9, %xmm3
4501 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
4502 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
4503 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
4504 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
4505 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
4506 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
4507 ; AVX2-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
4508 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4509 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4510 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4511 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4512 ; AVX2-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7]
4513 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
4514 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
4515 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4516 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4517 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
4518 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
4519 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4520 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4521 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,2,2]
4522 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3]
4523 ; AVX2-NEXT: vmovdqa %xmm11, %xmm12
4524 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
4525 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4526 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
4527 ; AVX2-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
4528 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
4529 ; AVX2-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
4530 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4531 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4532 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
4533 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
4534 ; AVX2-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
4535 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
4536 ; AVX2-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
4537 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4538 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4539 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
4540 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
4541 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4542 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4543 ; AVX2-NEXT: vpunpckhdq (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
4544 ; AVX2-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3]
4545 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4546 ; AVX2-NEXT: # xmm5 = mem[2,3,2,3]
4547 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
4548 ; AVX2-NEXT: # xmm13 = mem[3,3,3,3]
4549 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2,3]
4550 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4551 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
4552 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
4553 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
4554 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
4555 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
4556 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
4557 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4558 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4559 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4560 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload
4561 ; AVX2-NEXT: # xmm0 = xmm10[2],mem[2],xmm10[3],mem[3]
4562 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3]
4563 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3]
4564 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
4565 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4566 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
4567 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
4568 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4569 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
4570 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
4571 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
4572 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
4573 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4574 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4575 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4576 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
4577 ; AVX2-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4578 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4579 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4580 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
4581 ; AVX2-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4582 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4583 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4584 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
4585 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4586 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4587 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4588 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
4589 ; AVX2-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4590 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4591 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
4592 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm1
4593 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4594 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4595 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4596 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4597 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
4598 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4599 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
4600 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
4601 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4602 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
4603 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4604 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
4605 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4606 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4607 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
4608 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
4609 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4610 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
4611 ; AVX2-NEXT: # ymm3 = mem[0,1,1,3]
4612 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4613 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
4614 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4615 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
4616 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4617 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
4618 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
4619 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4620 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4621 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4622 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
4623 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4624 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4625 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4626 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
4627 ; AVX2-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4628 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
4629 ; AVX2-NEXT: vpbroadcastd %xmm11, %xmm1
4630 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4631 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4632 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
4633 ; AVX2-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4634 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4635 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4636 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
4637 ; AVX2-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4638 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
4639 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
4640 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
4641 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3]
4642 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4643 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4644 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
4645 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4646 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
4647 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4648 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4]
4649 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4650 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
4651 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
4652 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3]
4653 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4654 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4655 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
4656 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4657 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7]
4658 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4659 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4660 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4661 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
4662 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
4663 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
4664 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4665 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4666 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
4667 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4668 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
4669 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4670 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4671 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
4672 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
4673 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4674 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4675 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7]
4676 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4677 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4678 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
4679 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
4680 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
4681 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4682 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
4683 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4684 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
4685 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4686 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4687 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
4688 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4689 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4690 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
4691 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4692 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4693 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
4694 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
4695 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4696 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4697 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
4698 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
4699 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
4700 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4701 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
4702 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
4703 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
4704 ; AVX2-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
4705 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4706 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4707 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
4708 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
4709 ; AVX2-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
4710 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
4711 ; AVX2-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
4712 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4713 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4714 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
4715 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
4716 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4717 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4718 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
4719 ; AVX2-NEXT: vmovdqa %xmm11, %xmm7
4720 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
4721 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
4722 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3]
4723 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
4724 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
4725 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4726 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
4727 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4728 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4729 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
4730 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
4731 ; AVX2-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
4732 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
4733 ; AVX2-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
4734 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4735 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4736 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
4737 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
4738 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
4739 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4740 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
4741 ; AVX2-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
4742 ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,3,2,3]
4743 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4744 ; AVX2-NEXT: # xmm15 = mem[3,3,3,3]
4745 ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3]
4746 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3]
4747 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
4748 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
4749 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
4750 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
4751 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
4752 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
4753 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4754 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
4755 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
4756 ; AVX2-NEXT: # xmm3 = xmm7[2],mem[2],xmm7[3],mem[3]
4757 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3]
4758 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,3,3,3]
4759 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
4760 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
4761 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
4762 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
4763 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
4764 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
4765 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
4766 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
4767 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4768 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4769 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4770 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
4771 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4772 ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
4773 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4774 ; AVX2-NEXT: vmovaps %ymm1, (%rdx)
4775 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4776 ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
4777 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4778 ; AVX2-NEXT: vmovaps %ymm1, (%rcx)
4779 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4780 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
4781 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4782 ; AVX2-NEXT: vmovaps %ymm1, (%r8)
4783 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4784 ; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
4785 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4786 ; AVX2-NEXT: vmovaps %ymm1, (%r9)
4787 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4788 ; AVX2-NEXT: vmovaps %ymm1, 32(%r9)
4789 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
4790 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4791 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
4792 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4793 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
4794 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
4795 ; AVX2-NEXT: vmovdqa %ymm8, (%rax)
4796 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4797 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
4798 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
4799 ; AVX2-NEXT: vmovdqa %ymm0, (%rax)
4800 ; AVX2-NEXT: vmovdqa %ymm2, 32(%rax)
4801 ; AVX2-NEXT: addq $1000, %rsp # imm = 0x3E8
4802 ; AVX2-NEXT: vzeroupper
4803 ; AVX2-NEXT: retq
4804 ;
4805 ; AVX2-FP-LABEL: load_i16_stride8_vf32:
4806 ; AVX2-FP: # %bb.0:
4807 ; AVX2-FP-NEXT: subq $1000, %rsp # imm = 0x3E8
4808 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm2
4809 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4810 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm3
4811 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4812 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm0
4813 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4814 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm1
4815 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4816 ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm4
4817 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4818 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm5
4819 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4820 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4821 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4822 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4823 ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm0
4824 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4825 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm1
4826 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4827 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4828 ; AVX2-FP-NEXT: vpbroadcastd %xmm9, %xmm0
4829 ; AVX2-FP-NEXT: vmovdqa 336(%rdi), %xmm1
4830 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4831 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm4
4832 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4833 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4834 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4835 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm1
4836 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4837 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
4838 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4839 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
4840 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4841 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
4842 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4843 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4844 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4845 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4846 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
4847 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4848 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4849 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4850 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm2
4851 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4852 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm3
4853 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4854 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
4855 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4856 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,2,2,3,4,6,6,7]
4857 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4858 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4859 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4860 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
4861 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4862 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
4863 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
4864 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4865 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4866 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm0
4867 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4868 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm1
4869 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4870 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4871 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4872 ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %xmm0
4873 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm1
4874 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4875 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2
4876 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4877 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4878 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4879 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm1
4880 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4881 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
4882 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4883 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2
4884 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4885 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3
4886 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4887 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm4
4888 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4889 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
4890 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4891 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm15[0],xmm10[1],xmm15[1]
4892 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
4893 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0
4894 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4895 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1
4896 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4897 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
4898 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4899 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
4900 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4901 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
4902 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4903 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
4904 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4905 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm11[7]
4906 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2
4907 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4908 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0
4909 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4910 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
4911 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4912 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
4913 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4914 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4915 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4916 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4917 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4918 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
4919 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4920 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
4921 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4922 ; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4923 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
4924 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4925 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
4926 ; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm14
4927 ; AVX2-FP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
4928 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4929 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
4930 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
4931 ; AVX2-FP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4932 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4933 ; AVX2-FP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
4934 ; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4935 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
4936 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4937 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4938 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
4939 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
4940 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4941 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4942 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
4943 ; AVX2-FP-NEXT: vmovdqa %xmm10, %xmm11
4944 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3]
4945 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4946 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4947 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
4948 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
4949 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4950 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4951 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
4952 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4953 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4954 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
4955 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
4956 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4957 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4958 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
4959 ; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm3
4960 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
4961 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
4962 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
4963 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
4964 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
4965 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
4966 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
4967 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4968 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4969 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4970 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4971 ; AVX2-FP-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7]
4972 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
4973 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
4974 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4975 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4976 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
4977 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
4978 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4979 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4980 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,2,2]
4981 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3]
4982 ; AVX2-FP-NEXT: vmovdqa %xmm11, %xmm12
4983 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
4984 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4985 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
4986 ; AVX2-FP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
4987 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
4988 ; AVX2-FP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
4989 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4990 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4991 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
4992 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
4993 ; AVX2-FP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
4994 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
4995 ; AVX2-FP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
4996 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4997 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4998 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
4999 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
5000 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
5001 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5002 ; AVX2-FP-NEXT: vpunpckhdq (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
5003 ; AVX2-FP-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3]
5004 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5005 ; AVX2-FP-NEXT: # xmm5 = mem[2,3,2,3]
5006 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5007 ; AVX2-FP-NEXT: # xmm13 = mem[3,3,3,3]
5008 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2,3]
5009 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5010 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5011 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5012 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
5013 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5014 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5015 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
5016 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5017 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5018 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5019 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload
5020 ; AVX2-FP-NEXT: # xmm0 = xmm10[2],mem[2],xmm10[3],mem[3]
5021 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3]
5022 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3]
5023 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
5024 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5025 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5026 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5027 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5028 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5029 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5030 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5031 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5032 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5033 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5034 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5035 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
5036 ; AVX2-FP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5037 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5038 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5039 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
5040 ; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5041 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5042 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5043 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
5044 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5045 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5046 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5047 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
5048 ; AVX2-FP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5049 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5050 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
5051 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm1
5052 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5053 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5054 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5055 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5056 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
5057 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5058 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
5059 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
5060 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5061 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
5062 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5063 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
5064 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5065 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5066 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
5067 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
5068 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5069 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
5070 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3]
5071 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5072 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
5073 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5074 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
5075 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5076 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
5077 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
5078 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5079 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5080 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5081 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
5082 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5083 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5084 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5085 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
5086 ; AVX2-FP-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5087 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
5088 ; AVX2-FP-NEXT: vpbroadcastd %xmm11, %xmm1
5089 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5090 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5091 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
5092 ; AVX2-FP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
5093 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5094 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5095 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
5096 ; AVX2-FP-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
5097 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
5098 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
5099 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
5100 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3]
5101 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5102 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5103 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
5104 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5105 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
5106 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5107 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4]
5108 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5109 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
5110 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
5111 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3]
5112 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5113 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5114 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
5115 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5116 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7]
5117 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5118 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
5119 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5120 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
5121 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
5122 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
5123 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5124 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5125 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
5126 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5127 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
5128 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5129 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5130 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
5131 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
5132 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
5133 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
5134 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7]
5135 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
5136 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
5137 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
5138 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
5139 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
5140 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5141 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
5142 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5143 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
5144 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5145 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
5146 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
5147 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
5148 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
5149 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
5150 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
5151 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
5152 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
5153 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
5154 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5155 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5156 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
5157 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
5158 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
5159 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5160 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
5161 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
5162 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
5163 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
5164 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
5165 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
5166 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
5167 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
5168 ; AVX2-FP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
5169 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
5170 ; AVX2-FP-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
5171 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
5172 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
5173 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
5174 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
5175 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5176 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5177 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
5178 ; AVX2-FP-NEXT: vmovdqa %xmm11, %xmm7
5179 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
5180 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
5181 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3]
5182 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
5183 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
5184 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5185 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
5186 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
5187 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
5188 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
5189 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
5190 ; AVX2-FP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
5191 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
5192 ; AVX2-FP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
5193 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
5194 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
5195 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
5196 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
5197 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
5198 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5199 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
5200 ; AVX2-FP-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
5201 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,3,2,3]
5202 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5203 ; AVX2-FP-NEXT: # xmm15 = mem[3,3,3,3]
5204 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3]
5205 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3]
5206 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5207 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5208 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
5209 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5210 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5211 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
5212 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5213 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
5214 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
5215 ; AVX2-FP-NEXT: # xmm3 = xmm7[2],mem[2],xmm7[3],mem[3]
5216 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3]
5217 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,3,3,3]
5218 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
5219 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
5220 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5221 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5222 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
5223 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5224 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5225 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
5226 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5227 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5228 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5229 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
5230 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5231 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
5232 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5233 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
5234 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5235 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
5236 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5237 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
5238 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5239 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx)
5240 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5241 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
5242 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5243 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8)
5244 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5245 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r9)
5246 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5247 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9)
5248 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5249 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5250 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
5251 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5252 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
5253 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5254 ; AVX2-FP-NEXT: vmovdqa %ymm8, (%rax)
5255 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5256 ; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
5257 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5258 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax)
5259 ; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax)
5260 ; AVX2-FP-NEXT: addq $1000, %rsp # imm = 0x3E8
5261 ; AVX2-FP-NEXT: vzeroupper
5262 ; AVX2-FP-NEXT: retq
5264 ; AVX2-FCP-LABEL: load_i16_stride8_vf32:
5265 ; AVX2-FCP: # %bb.0:
5266 ; AVX2-FCP-NEXT: subq $1000, %rsp # imm = 0x3E8
5267 ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
5268 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5269 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm3
5270 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5271 ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm0
5272 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5273 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm1
5274 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5275 ; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm4
5276 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5277 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm5
5278 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5279 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
5280 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5281 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5282 ; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
5283 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5284 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
5285 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5286 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5287 ; AVX2-FCP-NEXT: vpbroadcastd %xmm9, %xmm0
5288 ; AVX2-FCP-NEXT: vmovdqa 336(%rdi), %xmm1
5289 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5290 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm4
5291 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5292 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
5293 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5294 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm1
5295 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5296 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
5297 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5298 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
5299 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5300 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
5301 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5302 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5303 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
5304 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5305 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
5306 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5307 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5308 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5309 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
5310 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5311 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm3
5312 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5313 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
5314 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5315 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,2,2,3,4,6,6,7]
5316 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5317 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
5318 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5319 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
5320 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5321 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
5322 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
5323 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5324 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5325 ; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
5326 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5327 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
5328 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5329 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5330 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5331 ; AVX2-FCP-NEXT: vpbroadcastd %xmm0, %xmm0
5332 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
5333 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5334 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
5335 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5336 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
5337 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5338 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm1
5339 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5340 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
5341 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5342 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
5343 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5344 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
5345 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5346 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
5347 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5348 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
5349 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
5350 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm15[0],xmm10[1],xmm15[1]
5351 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
5352 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0
5353 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5354 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
5355 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5356 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
5357 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5358 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
5359 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5360 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
5361 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5362 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
5363 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5364 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm11[7]
5365 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
5366 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5367 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
5368 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5369 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
5370 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5371 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
5372 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5373 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
5374 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5375 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
5376 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5377 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
5378 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
5379 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
5380 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5381 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5382 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
5383 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
5384 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
5385 ; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm14
5386 ; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
5387 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5388 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
5389 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5390 ; AVX2-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5391 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
5392 ; AVX2-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
5393 ; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
5394 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
5395 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
5396 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
5397 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
5398 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
5399 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5400 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5401 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
5402 ; AVX2-FCP-NEXT: vmovdqa %xmm10, %xmm11
5403 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3]
5404 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5405 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5406 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
5407 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5408 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
5409 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
5410 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
5411 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
5412 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
5413 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
5414 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5415 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5416 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5417 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
5418 ; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm3
5419 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
5420 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
5421 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
5422 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
5423 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
5424 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
5425 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
5426 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
5427 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
5428 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5429 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5430 ; AVX2-FCP-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7]
5431 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
5432 ; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
5433 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
5434 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
5435 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
5436 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
5437 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
5438 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5439 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,2,2]
5440 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3]
5441 ; AVX2-FCP-NEXT: vmovdqa %xmm11, %xmm12
5442 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
5443 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5444 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
5445 ; AVX2-FCP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
5446 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
5447 ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
5448 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
5449 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
5450 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
5451 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
5452 ; AVX2-FCP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
5453 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
5454 ; AVX2-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
5455 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
5456 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
5457 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
5458 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
5459 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
5460 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5461 ; AVX2-FCP-NEXT: vpunpckhdq (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
5462 ; AVX2-FCP-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3]
5463 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5464 ; AVX2-FCP-NEXT: # xmm5 = mem[2,3,2,3]
5465 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5466 ; AVX2-FCP-NEXT: # xmm13 = mem[3,3,3,3]
5467 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2,3]
5468 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5469 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5470 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5471 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
5472 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5473 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5474 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
5475 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5476 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5477 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5478 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload
5479 ; AVX2-FCP-NEXT: # xmm0 = xmm10[2],mem[2],xmm10[3],mem[3]
5480 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3]
5481 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3]
5482 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
5483 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5484 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5485 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5486 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5487 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5488 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5489 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5490 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5491 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5492 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5493 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5494 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
5495 ; AVX2-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5496 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5497 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5498 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
5499 ; AVX2-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5500 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5501 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5502 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
5503 ; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5504 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5505 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5506 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
5507 ; AVX2-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5508 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5509 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
5510 ; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm1
5511 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5512 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5513 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5514 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5515 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
5516 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5517 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
5518 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
5519 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5520 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
5521 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5522 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
5523 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5524 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5525 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
5526 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
5527 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5528 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
5529 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3]
5530 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5531 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
5532 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5533 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
5534 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5535 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
5536 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
5537 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5538 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5539 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5540 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
5541 ; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5542 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5543 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5544 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
5545 ; AVX2-FCP-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5546 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
5547 ; AVX2-FCP-NEXT: vpbroadcastd %xmm11, %xmm1
5548 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5549 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5550 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
5551 ; AVX2-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
5552 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5553 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5554 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
5555 ; AVX2-FCP-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
5556 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
5557 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
5558 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
5559 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3]
5560 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5561 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5562 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
5563 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5564 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
5565 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5566 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4]
5567 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5568 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
5569 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
5570 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3]
5571 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5572 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5573 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
5574 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5575 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7]
5576 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5577 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
5578 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5579 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
5580 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
5581 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
5582 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5583 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5584 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
5585 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5586 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
5587 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5588 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5589 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
5590 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
5591 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
5592 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
5593 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7]
5594 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
5595 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
5596 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
5597 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
5598 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
5599 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5600 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
5601 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5602 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
5603 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5604 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
5605 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
5606 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
5607 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
5608 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
5609 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
5610 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
5611 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
5612 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
5613 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5614 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5615 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
5616 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
5617 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
5618 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5619 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
5620 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
5621 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
5622 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
5623 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
5624 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
5625 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
5626 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
5627 ; AVX2-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
5628 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
5629 ; AVX2-FCP-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
5630 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
5631 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
5632 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
5633 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
5634 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5635 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5636 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
5637 ; AVX2-FCP-NEXT: vmovdqa %xmm11, %xmm7
5638 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
5639 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
5640 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3]
5641 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
5642 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
5643 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5644 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
5645 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
5646 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
5647 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
5648 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
5649 ; AVX2-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
5650 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
5651 ; AVX2-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
5652 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
5653 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
5654 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
5655 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
5656 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
5657 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
5658 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
5659 ; AVX2-FCP-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
5660 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,3,2,3]
5661 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5662 ; AVX2-FCP-NEXT: # xmm15 = mem[3,3,3,3]
5663 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3]
5664 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3]
5665 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5666 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5667 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
5668 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5669 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5670 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
5671 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5672 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
5673 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
5674 ; AVX2-FCP-NEXT: # xmm3 = xmm7[2],mem[2],xmm7[3],mem[3]
5675 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3]
5676 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,3,3,3]
5677 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
5678 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
5679 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5680 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5681 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
5682 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5683 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5684 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
5685 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5686 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5687 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5688 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
5689 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5690 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi)
5691 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5692 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
5693 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5694 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx)
5695 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5696 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
5697 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5698 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx)
5699 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5700 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
5701 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5702 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8)
5703 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5704 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9)
5705 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5706 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9)
5707 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5708 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5709 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
5710 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5711 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
5712 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5713 ; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rax)
5714 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5715 ; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
5716 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
5717 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
5718 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax)
5719 ; AVX2-FCP-NEXT: addq $1000, %rsp # imm = 0x3E8
5720 ; AVX2-FCP-NEXT: vzeroupper
5721 ; AVX2-FCP-NEXT: retq
5723 ; AVX512-LABEL: load_i16_stride8_vf32:
5724 ; AVX512: # %bb.0:
5725 ; AVX512-NEXT: subq $616, %rsp # imm = 0x268
5726 ; AVX512-NEXT: vmovdqa 368(%rdi), %xmm0
5727 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5728 ; AVX512-NEXT: vmovdqa 352(%rdi), %xmm1
5729 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5730 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5731 ; AVX512-NEXT: vmovdqa 336(%rdi), %xmm0
5732 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5733 ; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1
5734 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5735 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5736 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
5737 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
5738 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
5739 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
5740 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm26
5741 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
5742 ; AVX512-NEXT: vmovdqa %xmm5, %xmm0
5743 ; AVX512-NEXT: vpermt2d %xmm3, %xmm1, %xmm0
5744 ; AVX512-NEXT: vmovdqa 304(%rdi), %xmm2
5745 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5746 ; AVX512-NEXT: vmovdqa 288(%rdi), %xmm3
5747 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5748 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5749 ; AVX512-NEXT: vmovdqa 272(%rdi), %xmm2
5750 ; AVX512-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
5751 ; AVX512-NEXT: vmovdqa 256(%rdi), %xmm3
5752 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5753 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
5754 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
5755 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
5756 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16
5757 ; AVX512-NEXT: vmovdqa 480(%rdi), %ymm0
5758 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5759 ; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
5760 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4]
5761 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5762 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21
5763 ; AVX512-NEXT: vmovdqa 448(%rdi), %ymm2
5764 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5765 ; AVX512-NEXT: vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2]
5766 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4]
5767 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5768 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
5769 ; AVX512-NEXT: vmovdqa 416(%rdi), %ymm2
5770 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5771 ; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2]
5772 ; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2
5773 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5774 ; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2]
5775 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7]
5776 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5777 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7]
5778 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5779 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
5780 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
5781 ; AVX512-NEXT: movb $-64, %al
5782 ; AVX512-NEXT: kmovw %eax, %k1
5783 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1}
5784 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm2
5785 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5786 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm0
5787 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5788 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
5789 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3
5790 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5791 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2
5792 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5793 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
5794 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
5795 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3]
5796 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm23
5797 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
5798 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
5799 ; AVX512-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
5800 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
5801 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5802 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
5803 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5804 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3
5805 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5806 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm4
5807 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5808 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
5809 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
5810 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
5811 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3]
5812 ; AVX512-NEXT: vmovdqa 224(%rdi), %ymm0
5813 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5814 ; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2]
5815 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm0
5816 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5817 ; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
5818 ; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4]
5819 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5820 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
5821 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5822 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7]
5823 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm0
5824 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5825 ; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2]
5826 ; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm28
5827 ; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2]
5828 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7]
5829 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5830 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7]
5831 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5832 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
5833 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
5834 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
5835 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
5836 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5837 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
5838 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
5839 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4
5840 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
5841 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm4
5842 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
5843 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
5844 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
5845 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
5846 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
5847 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
5848 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
5849 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
5850 ; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1}
5851 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
5852 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
5853 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
5854 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
5855 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
5856 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
5857 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
5858 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
5859 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
5860 ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm4
5861 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
5862 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5863 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
5864 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5865 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
5866 ; AVX512-NEXT: vmovdqa64 %xmm26, %xmm1
5867 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5868 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
5869 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
5870 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
5871 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
5872 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
5873 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
5874 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7]
5875 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
5876 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7]
5877 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
5878 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
5879 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
5880 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
5881 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7]
5882 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
5883 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7]
5884 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
5885 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7]
5886 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7]
5887 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
5888 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7]
5889 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
5890 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
5891 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
5892 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
5893 ; AVX512-NEXT: vmovdqa64 %xmm23, %xmm15
5894 ; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
5895 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
5896 ; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0
5897 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5898 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0]
5899 ; AVX512-NEXT: vpermt2d %xmm11, %xmm6, %xmm7
5900 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm0
5901 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
5902 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
5903 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5904 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5905 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
5906 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5907 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5908 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
5909 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
5910 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
5911 ; AVX512-NEXT: vpermt2d %xmm8, %xmm6, %xmm3
5912 ; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1
5913 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5914 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5915 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5916 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
5917 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5918 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5919 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
5920 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5921 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5922 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
5923 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5924 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5925 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
5926 ; AVX512-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5927 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5928 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
5929 ; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
5930 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
5931 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
5932 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm30
5933 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5934 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
5935 ; AVX512-NEXT: vmovdqa %xmm1, %xmm2
5936 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
5937 ; AVX512-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
5938 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5939 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
5940 ; AVX512-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5941 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5942 ; AVX512-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload
5943 ; AVX512-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5944 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
5945 ; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20
5946 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
5947 ; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
5948 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
5949 ; AVX512-NEXT: # ymm19 = mem[0,1,1,3]
5950 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
5951 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5952 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
5953 ; AVX512-NEXT: # ymm21 = mem[0,1,1,3]
5954 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4]
5955 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5956 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
5957 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
5958 ; AVX512-NEXT: # ymm29 = mem[0,1,1,3]
5959 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
5960 ; AVX512-NEXT: # ymm23 = mem[0,1,1,3]
5961 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
5962 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5963 ; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7]
5964 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5965 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
5966 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5967 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1}
5968 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5969 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
5970 ; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5971 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5972 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
5973 ; AVX512-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5974 ; AVX512-NEXT: vmovdqa %xmm1, %xmm0
5975 ; AVX512-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
5976 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm16
5977 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm18
5978 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5979 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
5980 ; AVX512-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
5981 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5982 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
5983 ; AVX512-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
5984 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
5985 ; AVX512-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3]
5986 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
5987 ; AVX512-NEXT: # ymm17 = mem[0,1,1,3]
5988 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4]
5989 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5990 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
5991 ; AVX512-NEXT: # ymm24 = mem[0,1,1,3]
5992 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4]
5993 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5994 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7]
5995 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
5996 ; AVX512-NEXT: # ymm25 = mem[0,1,1,3]
5997 ; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3]
5998 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7]
5999 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6000 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
6001 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6002 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
6003 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6004 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6005 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm27
6006 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
6007 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm12
6008 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
6009 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2
6010 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
6011 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6012 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6013 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
6014 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6015 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6016 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
6017 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
6018 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6019 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
6020 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6021 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6022 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6023 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6024 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6025 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
6026 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6027 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm9
6028 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1]
6029 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
6030 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
6031 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
6032 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6033 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20
6034 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3]
6035 ; AVX512-NEXT: vmovdqa64 %xmm12, %xmm16
6036 ; AVX512-NEXT: vmovdqa64 %xmm30, %xmm1
6037 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6038 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1
6039 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
6040 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6041 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7]
6042 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6043 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7]
6044 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7]
6045 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6046 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7]
6047 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6048 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7]
6049 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
6050 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
6051 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
6052 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6053 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
6054 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6055 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
6056 ; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7]
6057 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6058 ; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7]
6059 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6060 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
6061 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
6062 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2]
6063 ; AVX512-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3]
6064 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
6065 ; AVX512-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
6066 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
6067 ; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
6068 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0]
6069 ; AVX512-NEXT: vpermt2d %xmm16, %xmm12, %xmm4
6070 ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm11
6071 ; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
6072 ; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
6073 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6074 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6075 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6076 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6077 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6078 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
6079 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6080 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1}
6081 ; AVX512-NEXT: vpermt2d %xmm8, %xmm12, %xmm5
6082 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
6083 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
6084 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6085 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6086 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
6087 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6088 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6089 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
6090 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6091 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6092 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
6093 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6094 ; AVX512-NEXT: vmovaps %zmm2, (%rsi)
6095 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6096 ; AVX512-NEXT: vmovaps %zmm2, (%rdx)
6097 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6098 ; AVX512-NEXT: vmovaps %zmm2, (%rcx)
6099 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6100 ; AVX512-NEXT: vmovaps %zmm2, (%r8)
6101 ; AVX512-NEXT: vmovdqa64 %zmm27, (%r9)
6102 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
6103 ; AVX512-NEXT: vmovdqa64 %zmm20, (%rax)
6104 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
6105 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
6106 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
6107 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
6108 ; AVX512-NEXT: addq $616, %rsp # imm = 0x268
6109 ; AVX512-NEXT: vzeroupper
6112 ; AVX512-FCP-LABEL: load_i16_stride8_vf32:
6113 ; AVX512-FCP: # %bb.0:
6114 ; AVX512-FCP-NEXT: subq $552, %rsp # imm = 0x228
6115 ; AVX512-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
6116 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6117 ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
6118 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6119 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6120 ; AVX512-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
6121 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6122 ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
6123 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6124 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6125 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
6126 ; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm0
6127 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
6128 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0
6129 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
6130 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm2
6131 ; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
6132 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6133 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm3
6134 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6135 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6136 ; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
6137 ; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
6138 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
6139 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6140 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6141 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
6142 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
6143 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
6144 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm11
6145 ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm0
6146 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6147 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2]
6148 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4]
6149 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6150 ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm1
6151 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6152 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2]
6153 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,1,2,0,4,5,6,4]
6154 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6155 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
6156 ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm1
6157 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6158 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm28 = ymm1[0,1,0,2]
6159 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm1
6160 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6161 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
6162 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7]
6163 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6164 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7]
6165 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6166 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
6167 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6168 ; AVX512-FCP-NEXT: movb $-64, %al
6169 ; AVX512-FCP-NEXT: kmovw %eax, %k1
6170 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
6171 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
6172 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6173 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
6174 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6175 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6176 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
6177 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6178 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
6179 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6180 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6181 ; AVX512-FCP-NEXT: vmovdqa %xmm13, %xmm0
6182 ; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm2, %xmm0
6183 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25
6184 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
6185 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6186 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
6187 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6188 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
6189 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6190 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
6191 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6192 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6193 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
6194 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
6195 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
6196 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
6197 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
6198 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6199 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
6200 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
6201 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2]
6202 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4]
6203 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6204 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4]
6205 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6206 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7]
6207 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
6208 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6209 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2]
6210 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
6211 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6212 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
6213 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
6214 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6215 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
6216 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6217 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
6218 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6219 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6220 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
6221 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6222 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0]
6223 ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0
6224 ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm11, %xmm0
6225 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1]
6226 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6227 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6228 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6229 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
6230 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6231 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6232 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7]
6233 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6234 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
6235 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
6236 ; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm1
6237 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm11, %xmm1
6238 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1]
6239 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
6240 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6241 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6242 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
6243 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6244 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6245 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
6246 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
6247 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6248 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
6249 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6250 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
6251 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
6252 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
6253 ; AVX512-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2
6254 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
6255 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
6256 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm3
6257 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7]
6258 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6259 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7]
6260 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6261 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm5[7]
6262 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7]
6263 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6264 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7]
6265 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6266 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
6267 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
6268 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1}
6269 ; AVX512-FCP-NEXT: vmovdqa %xmm13, %xmm7
6270 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3]
6271 ; AVX512-FCP-NEXT: vpermt2d %xmm25, %xmm0, %xmm7
6272 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
6273 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
6274 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7]
6275 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6276 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
6277 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6278 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
6279 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
6280 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6281 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
6282 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6283 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
6284 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
6285 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
6286 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
6287 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6288 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0]
6289 ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9
6290 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm0
6291 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
6292 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6293 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6294 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6295 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6296 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6297 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6298 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
6299 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6300 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
6301 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm27, %xmm6
6302 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3]
6303 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6304 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6305 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
6306 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6307 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6308 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
6309 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
6310 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6311 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28
6312 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6313 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
6314 ; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6315 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6316 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
6317 ; AVX512-FCP-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6318 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6319 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
6320 ; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6321 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6322 ; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload
6323 ; AVX512-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6324 ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm1
6325 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
6326 ; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm0, %xmm1
6327 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
6328 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
6329 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm18
6330 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
6331 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31
6332 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
6333 ; AVX512-FCP-NEXT: # ymm29 = mem[0,1,1,3]
6334 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
6335 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
6336 ; AVX512-FCP-NEXT: # ymm26 = mem[0,1,1,3]
6337 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6338 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4]
6339 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6340 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
6341 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
6342 ; AVX512-FCP-NEXT: # ymm30 = mem[0,1,1,3]
6343 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
6344 ; AVX512-FCP-NEXT: # ymm25 = mem[0,1,1,3]
6345 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7]
6346 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6347 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
6348 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6349 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
6350 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
6351 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1}
6352 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6353 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
6354 ; AVX512-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6355 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6356 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
6357 ; AVX512-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6358 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6359 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
6360 ; AVX512-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6361 ; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
6362 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
6363 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
6364 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6365 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
6366 ; AVX512-FCP-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6367 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
6368 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm16
6369 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3]
6370 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
6371 ; AVX512-FCP-NEXT: # ymm20 = mem[0,1,1,3]
6372 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4]
6373 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3]
6374 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6375 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4]
6376 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6377 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
6378 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
6379 ; AVX512-FCP-NEXT: # ymm19 = mem[0,1,1,3]
6380 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
6381 ; AVX512-FCP-NEXT: # ymm21 = mem[0,1,1,3]
6382 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7]
6383 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6384 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7]
6385 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6386 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
6387 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6388 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
6389 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31
6390 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm0
6391 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0]
6392 ; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm4, %xmm0
6393 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
6394 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6395 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6396 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6397 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
6398 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6399 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6400 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
6401 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6402 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6403 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
6404 ; AVX512-FCP-NEXT: vmovdqa %xmm4, %xmm3
6405 ; AVX512-FCP-NEXT: vpermi2d %xmm16, %xmm7, %xmm3
6406 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1]
6407 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3]
6408 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6409 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6410 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
6411 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6412 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6413 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
6414 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6415 ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm8
6416 ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
6417 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
6418 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7]
6419 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24
6420 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
6421 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm0, %xmm8
6422 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3]
6423 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
6424 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
6425 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7]
6426 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6427 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7]
6428 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6429 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
6430 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7]
6431 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6432 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7]
6433 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6434 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7]
6435 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
6436 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
6437 ; AVX512-FCP-NEXT: vpermi2d %xmm22, %xmm17, %xmm0
6438 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
6439 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
6440 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7]
6441 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6442 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7]
6443 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6444 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
6445 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7]
6446 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6447 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7]
6448 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6449 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7]
6450 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7]
6451 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
6452 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
6453 ; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm27, %xmm5
6454 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3]
6455 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
6456 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6457 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6458 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
6459 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6460 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6461 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
6462 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
6463 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
6464 ; AVX512-FCP-NEXT: vpermt2d %xmm16, %xmm27, %xmm7
6465 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3]
6466 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
6467 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6468 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6469 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
6470 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6471 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6472 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
6473 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
6474 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6475 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
6476 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6477 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi)
6478 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6479 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx)
6480 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6481 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rcx)
6482 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%r8)
6483 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%r9)
6484 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6485 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rax)
6486 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6487 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
6488 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6489 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
6490 ; AVX512-FCP-NEXT: addq $552, %rsp # imm = 0x228
6491 ; AVX512-FCP-NEXT: vzeroupper
6492 ; AVX512-FCP-NEXT: retq
6494 ; AVX512DQ-LABEL: load_i16_stride8_vf32:
6495 ; AVX512DQ: # %bb.0:
6496 ; AVX512DQ-NEXT: subq $616, %rsp # imm = 0x268
6497 ; AVX512DQ-NEXT: vmovdqa 368(%rdi), %xmm0
6498 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6499 ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm1
6500 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6501 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6502 ; AVX512DQ-NEXT: vmovdqa 336(%rdi), %xmm0
6503 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6504 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1
6505 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6506 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6507 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
6508 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
6509 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
6510 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
6511 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm26
6512 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
6513 ; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm0
6514 ; AVX512DQ-NEXT: vpermt2d %xmm3, %xmm1, %xmm0
6515 ; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm2
6516 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6517 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm3
6518 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6519 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6520 ; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm2
6521 ; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
6522 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm3
6523 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6524 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6525 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
6526 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
6527 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16
6528 ; AVX512DQ-NEXT: vmovdqa 480(%rdi), %ymm0
6529 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6530 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
6531 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4]
6532 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6533 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21
6534 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm2
6535 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6536 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2]
6537 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4]
6538 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6539 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
6540 ; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm2
6541 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6542 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2]
6543 ; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm2
6544 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6545 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2]
6546 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7]
6547 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6548 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7]
6549 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6550 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
6551 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
6552 ; AVX512DQ-NEXT: movb $-64, %al
6553 ; AVX512DQ-NEXT: kmovw %eax, %k1
6554 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1}
6555 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm2
6556 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6557 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm0
6558 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6559 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
6560 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3
6561 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6562 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2
6563 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6564 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6565 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
6566 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3]
6567 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm23
6568 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
6569 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
6570 ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
6571 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
6572 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6573 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
6574 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6575 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm3
6576 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6577 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm4
6578 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6579 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6580 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6581 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
6582 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3]
6583 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm0
6584 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6585 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2]
6586 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm0
6587 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6588 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
6589 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4]
6590 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6591 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
6592 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6593 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7]
6594 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm0
6595 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6596 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2]
6597 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm28
6598 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2]
6599 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7]
6600 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6601 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7]
6602 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6603 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
6604 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
6605 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
6606 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
6607 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6608 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
6609 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
6610 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4
6611 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
6612 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm4
6613 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6614 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6615 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
6616 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6617 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6618 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
6619 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
6620 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6621 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1}
6622 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6623 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6624 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
6625 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6626 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6627 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
6628 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
6629 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
6630 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
6631 ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm4
6632 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
6633 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6634 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
6635 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6636 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
6637 ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm1
6638 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6639 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6640 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
6641 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6642 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
6643 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6644 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
6645 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7]
6646 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6647 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7]
6648 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6649 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
6650 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
6651 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
6652 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7]
6653 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6654 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7]
6655 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6656 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7]
6657 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7]
6658 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6659 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7]
6660 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6661 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
6662 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
6663 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
6664 ; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm15
6665 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
6666 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
6667 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0
6668 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6669 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0]
6670 ; AVX512DQ-NEXT: vpermt2d %xmm11, %xmm6, %xmm7
6671 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm0
6672 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
6673 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6674 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6675 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6676 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
6677 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6678 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6679 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
6680 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
6681 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
6682 ; AVX512DQ-NEXT: vpermt2d %xmm8, %xmm6, %xmm3
6683 ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
6684 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
6685 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6686 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6687 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6688 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6689 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6690 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
6691 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6692 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6693 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
6694 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6695 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6696 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6697 ; AVX512DQ-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6698 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6699 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6700 ; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6701 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
6702 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
6703 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm30
6704 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6705 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6706 ; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm2
6707 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
6708 ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
6709 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6710 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
6711 ; AVX512DQ-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6712 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6713 ; AVX512DQ-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload
6714 ; AVX512DQ-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6715 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
6716 ; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm20
6717 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
6718 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
6719 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
6720 ; AVX512DQ-NEXT: # ymm19 = mem[0,1,1,3]
6721 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
6722 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6723 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
6724 ; AVX512DQ-NEXT: # ymm21 = mem[0,1,1,3]
6725 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4]
6726 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6727 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6728 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
6729 ; AVX512DQ-NEXT: # ymm29 = mem[0,1,1,3]
6730 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
6731 ; AVX512DQ-NEXT: # ymm23 = mem[0,1,1,3]
6732 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
6733 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6734 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7]
6735 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6736 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
6737 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6738 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1}
6739 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6740 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
6741 ; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6742 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6743 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
6744 ; AVX512DQ-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6745 ; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm0
6746 ; AVX512DQ-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
6747 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm16
6748 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm18
6749 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6750 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
6751 ; AVX512DQ-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6752 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6753 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
6754 ; AVX512DQ-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6755 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
6756 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3]
6757 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
6758 ; AVX512DQ-NEXT: # ymm17 = mem[0,1,1,3]
6759 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4]
6760 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6761 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
6762 ; AVX512DQ-NEXT: # ymm24 = mem[0,1,1,3]
6763 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4]
6764 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6765 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7]
6766 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
6767 ; AVX512DQ-NEXT: # ymm25 = mem[0,1,1,3]
6768 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3]
6769 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7]
6770 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6771 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
6772 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6773 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
6774 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6775 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6776 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm27
6777 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
6778 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm12
6779 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
6780 ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm2
6781 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
6782 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6783 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6784 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
6785 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6786 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6787 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
6788 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
6789 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6790 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
6791 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6792 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6793 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6794 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6795 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6796 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
6797 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6798 ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm9
6799 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1]
6800 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
6801 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
6802 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
6803 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6804 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20
6805 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3]
6806 ; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm16
6807 ; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm1
6808 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6809 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1
6810 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
6811 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6812 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7]
6813 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6814 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7]
6815 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7]
6816 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6817 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7]
6818 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6819 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7]
6820 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
6821 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
6822 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
6823 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6824 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
6825 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6826 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
6827 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7]
6828 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6829 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7]
6830 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6831 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
6832 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
6833 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2]
6834 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3]
6835 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
6836 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
6837 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
6838 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
6839 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0]
6840 ; AVX512DQ-NEXT: vpermt2d %xmm16, %xmm12, %xmm4
6841 ; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm11
6842 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
6843 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
6844 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6845 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6846 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6847 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6848 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6849 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
6850 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6851 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1}
6852 ; AVX512DQ-NEXT: vpermt2d %xmm8, %xmm12, %xmm5
6853 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
6854 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
6855 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6856 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6857 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
6858 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6859 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6860 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
6861 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6862 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6863 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
6864 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6865 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi)
6866 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6867 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx)
6868 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6869 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx)
6870 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6871 ; AVX512DQ-NEXT: vmovaps %zmm2, (%r8)
6872 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%r9)
6873 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6874 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rax)
6875 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6876 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
6877 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6878 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
6879 ; AVX512DQ-NEXT: addq $616, %rsp # imm = 0x268
6880 ; AVX512DQ-NEXT: vzeroupper
6881 ; AVX512DQ-NEXT: retq
6882 ;
6883 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf32:
6884 ; AVX512DQ-FCP: # %bb.0:
6885 ; AVX512DQ-FCP-NEXT: subq $552, %rsp # imm = 0x228
6886 ; AVX512DQ-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
6887 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6888 ; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
6889 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6890 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6891 ; AVX512DQ-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
6892 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6893 ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
6894 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6895 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6896 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
6897 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm0
6898 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
6899 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0
6900 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
6901 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm2
6902 ; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
6903 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6904 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm3
6905 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6906 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6907 ; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
6908 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
6909 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
6910 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6911 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6912 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
6913 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
6914 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
6915 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm11
6916 ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm0
6917 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6918 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2]
6919 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4]
6920 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6921 ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm1
6922 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6923 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2]
6924 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,1,2,0,4,5,6,4]
6925 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6926 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
6927 ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm1
6928 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6929 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm28 = ymm1[0,1,0,2]
6930 ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm1
6931 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6932 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
6933 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7]
6934 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6935 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7]
6936 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6937 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
6938 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6939 ; AVX512DQ-FCP-NEXT: movb $-64, %al
6940 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
6941 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
6942 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
6943 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6944 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
6945 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6946 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6947 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
6948 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6949 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
6950 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6951 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6952 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, %xmm0
6953 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm2, %xmm0
6954 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm25
6955 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
6956 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6957 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
6958 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6959 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
6960 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6961 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
6962 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6963 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6964 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
6965 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
6966 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
6967 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
6968 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
6969 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6970 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
6971 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
6972 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2]
6973 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4]
6974 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6975 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4]
6976 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6977 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7]
6978 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
6979 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6980 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2]
6981 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
6982 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6983 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
6984 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
6985 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6986 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
6987 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6988 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
6989 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6990 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6991 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
6992 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6993 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0]
6994 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0
6995 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm11, %xmm0
6996 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1]
6997 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6998 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6999 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
7000 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
7001 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7002 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
7003 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7]
7004 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
7005 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
7006 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
7007 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm1
7008 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm11, %xmm1
7009 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1]
7010 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
7011 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
7012 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
7013 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
7014 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7015 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
7016 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
7017 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
7018 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
7019 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
7020 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7021 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
7022 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
7023 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
7024 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2
7025 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
7026 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
7027 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm3
7028 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7]
7029 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
7030 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7]
7031 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
7032 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm5[7]
7033 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7]
7034 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
7035 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7]
7036 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
7037 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
7038 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
7039 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1}
7040 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, %xmm7
7041 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3]
7042 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm25, %xmm0, %xmm7
7043 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
7044 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
7045 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7]
7046 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
7047 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
7048 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
7049 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
7050 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
7051 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
7052 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
7053 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
7054 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
7055 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
7056 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
7057 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7058 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7059 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0]
7060 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9
7061 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0
7062 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
7063 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
7064 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
7065 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
7066 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
7067 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
7068 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
7069 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
7070 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
7071 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
7072 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm27, %xmm6
7073 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3]
7074 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
7075 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
7076 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
7077 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
7078 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
7079 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
7080 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
7081 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7082 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28
7083 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7084 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
7085 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7086 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7087 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
7088 ; AVX512DQ-FCP-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7089 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7090 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
7091 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7092 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7093 ; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload
7094 ; AVX512DQ-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7095 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm1
7096 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
7097 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm0, %xmm1
7098 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
7099 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
7100 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm18
7101 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
7102 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31
7103 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
7104 ; AVX512DQ-FCP-NEXT: # ymm29 = mem[0,1,1,3]
7105 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
7106 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
7107 ; AVX512DQ-FCP-NEXT: # ymm26 = mem[0,1,1,3]
7108 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
7109 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4]
7110 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
7111 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
7112 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
7113 ; AVX512DQ-FCP-NEXT: # ymm30 = mem[0,1,1,3]
7114 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
7115 ; AVX512DQ-FCP-NEXT: # ymm25 = mem[0,1,1,3]
7116 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7]
7117 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
7118 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
7119 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
7120 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
7121 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
7122 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1}
7123 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7124 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
7125 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
7126 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7127 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
7128 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
7129 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7130 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
7131 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
7132 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
7133 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
7134 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
7135 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7136 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
7137 ; AVX512DQ-FCP-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
7138 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
7139 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm16
7140 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3]
7141 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
7142 ; AVX512DQ-FCP-NEXT: # ymm20 = mem[0,1,1,3]
7143 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4]
7144 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3]
7145 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
7146 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4]
7147 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
7148 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
7149 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
7150 ; AVX512DQ-FCP-NEXT: # ymm19 = mem[0,1,1,3]
7151 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
7152 ; AVX512DQ-FCP-NEXT: # ymm21 = mem[0,1,1,3]
7153 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7]
7154 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
7155 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7]
7156 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
7157 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
7158 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
7159 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
7160 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31
7161 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm0
7162 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0]
7163 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm4, %xmm0
7164 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
7165 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
7166 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
7167 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
7168 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
7169 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7170 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
7171 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
7172 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
7173 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
7174 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
7175 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm3
7176 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm16, %xmm7, %xmm3
7177 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1]
7178 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3]
7179 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
7180 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
7181 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
7182 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7183 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
7184 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
7185 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7]
7186 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm8
7187 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
7188 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
7189 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7]
7190 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24
7191 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
7192 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm0, %xmm8
7193 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3]
7194 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
7195 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
7196 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7]
7197 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
7198 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7]
7199 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
7200 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
7201 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7]
7202 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
7203 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7]
7204 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
7205 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7]
7206 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
7207 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
7208 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm22, %xmm17, %xmm0
7209 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
7210 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
7211 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7]
7212 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
7213 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7]
7214 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
7215 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
7216 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7]
7217 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
7218 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7]
7219 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
7220 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7]
7221 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7]
7222 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
7223 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
7224 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm27, %xmm5
7225 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3]
7226 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
7227 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
7228 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
7229 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
7230 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
7231 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
7232 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
7233 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
7234 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
7235 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm16, %xmm27, %xmm7
7236 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3]
7237 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
7238 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
7239 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
7240 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
7241 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
7242 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
7243 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
7244 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
7245 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7246 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7247 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7248 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi)
7249 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7250 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx)
7251 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7252 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rcx)
7253 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%r8)
7254 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%r9)
7255 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7256 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rax)
7257 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7258 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
7259 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7260 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
7261 ; AVX512DQ-FCP-NEXT: addq $552, %rsp # imm = 0x228
7262 ; AVX512DQ-FCP-NEXT: vzeroupper
7263 ; AVX512DQ-FCP-NEXT: retq
7264 ;
7265 ; AVX512BW-LABEL: load_i16_stride8_vf32:
7266 ; AVX512BW: # %bb.0:
7267 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
7268 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
7269 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
7270 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
7271 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
7272 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
7273 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4
7274 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5
7275 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3
7276 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7
7277 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6
7278 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
7279 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7280 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9
7281 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm9
7282 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
7283 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm10
7284 ; AVX512BW-NEXT: movb $-64, %dil
7285 ; AVX512BW-NEXT: kmovd %edi, %k1
7286 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
7287 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9
7288 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm9
7289 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm8
7290 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
7291 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7292 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
7293 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7294 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10
7295 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm10
7296 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11
7297 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11
7298 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
7299 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
7300 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm9, %zmm10
7301 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm9
7302 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
7303 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
7304 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
7305 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7306 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11
7307 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11
7308 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12
7309 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm12
7310 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
7311 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11
7312 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm11
7313 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm10
7314 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
7315 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
7316 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
7317 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7318 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12
7319 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm11, %zmm12
7320 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13
7321 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm13
7322 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
7323 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12
7324 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm12
7325 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm11
7326 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
7327 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
7328 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
7329 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7330 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13
7331 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13
7332 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14
7333 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14
7334 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
7335 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13
7336 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm13
7337 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm12
7338 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
7339 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
7340 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
7341 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7342 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14
7343 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm14
7344 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15
7345 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15
7346 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
7347 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14
7348 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm13, %zmm14
7349 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm13
7350 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
7351 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
7352 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
7353 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7354 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15
7355 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15
7356 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16
7357 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm16
7358 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
7359 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15
7360 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm15
7361 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm14
7362 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
7363 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
7364 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
7365 ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7366 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6
7367 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm3
7368 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
7369 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm15, %zmm1
7370 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm0
7371 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7372 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7373 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi)
7374 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx)
7375 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx)
7376 ; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8)
7377 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r9)
7378 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r11)
7379 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10)
7380 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
7381 ; AVX512BW-NEXT: vzeroupper
7382 ; AVX512BW-NEXT: retq
7383 ;
7384 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf32:
7385 ; AVX512BW-FCP: # %bb.0:
7386 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7387 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7388 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
7389 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
7390 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
7391 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
7392 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
7393 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5
7394 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
7395 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
7396 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
7397 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
7398 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7399 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
7400 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm9
7401 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
7402 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm8, %zmm10
7403 ; AVX512BW-FCP-NEXT: movb $-64, %dil
7404 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
7405 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
7406 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
7407 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm9
7408 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm8
7409 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
7410 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7411 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
7412 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7413 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
7414 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm10
7415 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11
7416 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm9, %zmm11
7417 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
7418 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
7419 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm10
7420 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm9
7421 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
7422 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
7423 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
7424 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7425 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
7426 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm11
7427 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
7428 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm12
7429 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
7430 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
7431 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm11
7432 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm10
7433 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
7434 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
7435 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
7436 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7437 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
7438 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm11, %zmm12
7439 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
7440 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm13
7441 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
7442 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
7443 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm11, %zmm12
7444 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm11
7445 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
7446 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
7447 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
7448 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7449 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
7450 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm12, %zmm13
7451 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14
7452 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm12, %zmm14
7453 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
7454 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
7455 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm13
7456 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm12
7457 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
7458 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
7459 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
7460 ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7461 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
7462 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm14
7463 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
7464 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm15
7465 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
7466 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
7467 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm14
7468 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm13
7469 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
7470 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
7471 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
7472 ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7473 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
7474 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm14, %zmm15
7475 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16
7476 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm16
7477 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
7478 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
7479 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm15
7480 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm14
7481 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
7482 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
7483 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
7484 ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7485 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm6
7486 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm15, %zmm3
7487 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
7488 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm15, %zmm1
7489 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm15, %zmm0
7490 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7491 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7492 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
7493 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
7494 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
7495 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8)
7496 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9)
7497 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11)
7498 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10)
7499 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
7500 ; AVX512BW-FCP-NEXT: vzeroupper
7501 ; AVX512BW-FCP-NEXT: retq
7502 ;
7503 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf32:
7504 ; AVX512DQ-BW: # %bb.0:
7505 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
7506 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
7507 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
7508 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
7509 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
7510 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
7511 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4
7512 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm5
7513 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3
7514 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7
7515 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6
7516 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
7517 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7518 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9
7519 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm9
7520 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
7521 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm10
7522 ; AVX512DQ-BW-NEXT: movb $-64, %dil
7523 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
7524 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
7525 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9
7526 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm9
7527 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm8
7528 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
7529 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7530 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
7531 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7532 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10
7533 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm10
7534 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11
7535 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11
7536 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
7537 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10
7538 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm9, %zmm10
7539 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm9
7540 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
7541 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
7542 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
7543 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7544 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11
7545 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11
7546 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12
7547 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm12
7548 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
7549 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11
7550 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm11
7551 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm10
7552 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
7553 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
7554 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
7555 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7556 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12
7557 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm11, %zmm12
7558 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13
7559 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm13
7560 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
7561 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12
7562 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm12
7563 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm11
7564 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
7565 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
7566 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
7567 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7568 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13
7569 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13
7570 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14
7571 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14
7572 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
7573 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13
7574 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm13
7575 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm12
7576 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
7577 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
7578 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
7579 ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7580 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14
7581 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm14
7582 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15
7583 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15
7584 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
7585 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14
7586 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm13, %zmm14
7587 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm13
7588 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
7589 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
7590 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
7591 ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7592 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15
7593 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15
7594 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm16
7595 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm16
7596 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
7597 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15
7598 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm15
7599 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm14
7600 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
7601 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
7602 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
7603 ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7604 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6
7605 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm3
7606 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
7607 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm15, %zmm1
7608 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm0
7609 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7610 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7611 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi)
7612 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdx)
7613 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx)
7614 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r8)
7615 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r9)
7616 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r11)
7617 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%r10)
7618 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
7619 ; AVX512DQ-BW-NEXT: vzeroupper
7620 ; AVX512DQ-BW-NEXT: retq
7621 ;
7622 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf32:
7623 ; AVX512DQ-BW-FCP: # %bb.0:
7624 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7625 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7626 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
7627 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
7628 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
7629 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
7630 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
7631 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5
7632 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
7633 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
7634 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
7635 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
7636 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7637 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
7638 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm9
7639 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
7640 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm8, %zmm10
7641 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil
7642 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
7643 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
7644 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
7645 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm9
7646 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm8
7647 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
7648 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7649 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
7650 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7651 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
7652 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm10
7653 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11
7654 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm9, %zmm11
7655 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
7656 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
7657 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm10
7658 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm9
7659 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
7660 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
7661 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
7662 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7663 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
7664 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm11
7665 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
7666 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm12
7667 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
7668 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
7669 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm11
7670 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm10
7671 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
7672 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
7673 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
7674 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7675 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
7676 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm11, %zmm12
7677 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
7678 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm13
7679 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
7680 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
7681 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm11, %zmm12
7682 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm11
7683 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
7684 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
7685 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
7686 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7687 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
7688 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm12, %zmm13
7689 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14
7690 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm12, %zmm14
7691 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
7692 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
7693 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm13
7694 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm12
7695 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
7696 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
7697 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
7698 ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7699 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
7700 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm14
7701 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
7702 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm15
7703 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
7704 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
7705 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm14
7706 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm13
7707 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
7708 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
7709 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
7710 ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7711 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
7712 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm14, %zmm15
7713 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16
7714 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm16
7715 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
7716 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
7717 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm15
7718 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm14
7719 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
7720 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
7721 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
7722 ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7723 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm6
7724 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm15, %zmm3
7725 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
7726 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm15, %zmm1
7727 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm15, %zmm0
7728 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7729 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7730 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
7731 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
7732 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
7733 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8)
7734 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9)
7735 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11)
7736 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10)
7737 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
7738 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
7739 ; AVX512DQ-BW-FCP-NEXT: retq
7740 %wide.vec = load <256 x i16>, ptr %in.vec, align 64
7741 %strided.vec0 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248>
7742 %strided.vec1 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249>
7743 %strided.vec2 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250>
7744 %strided.vec3 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251>
7745 %strided.vec4 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252>
7746 %strided.vec5 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253>
7747 %strided.vec6 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254>
7748 %strided.vec7 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255>
7749 store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
7750 store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
7751 store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
7752 store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
7753 store <32 x i16> %strided.vec4, ptr %out.vec4, align 64
7754 store <32 x i16> %strided.vec5, ptr %out.vec5, align 64
7755 store <32 x i16> %strided.vec6, ptr %out.vec6, align 64
7756 store <32 x i16> %strided.vec7, ptr %out.vec7, align 64
7757 ret void
7758 }
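; NOTE (editorial, not autogenerated): in the IR above, %strided.vecN gathers lanes
; N, N+8, N+16, ... of %wide.vec, i.e. lane j of %strided.vecN is lane 8*j + N of the
; wide load. A rough C sketch of an access pattern that deinterleaves the same way
; (names and types are hypothetical, for illustration only):
;
;   // 8 interleaved u16 fields per element; out[k][i] receives field k of element i
;   for (int i = 0; i < n; ++i)
;     for (int k = 0; k < 8; ++k)
;       out[k][i] = in[8*i + k];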
7760 define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
7761 ; SSE-LABEL: load_i16_stride8_vf64:
7762 ; SSE:       # %bb.0:
7763 ; SSE-NEXT: subq $1800, %rsp # imm = 0x708
7764 ; SSE-NEXT: movdqa 752(%rdi), %xmm2
7765 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7766 ; SSE-NEXT: movdqa 736(%rdi), %xmm3
7767 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7768 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
7769 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7770 ; SSE-NEXT: movdqa 192(%rdi), %xmm4
7771 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7772 ; SSE-NEXT: movdqa 240(%rdi), %xmm5
7773 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7774 ; SSE-NEXT: movdqa 224(%rdi), %xmm6
7775 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7776 ; SSE-NEXT: movdqa 144(%rdi), %xmm7
7777 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7778 ; SSE-NEXT: movdqa 128(%rdi), %xmm8
7779 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7780 ; SSE-NEXT: movdqa 176(%rdi), %xmm9
7781 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7782 ; SSE-NEXT: movdqa 160(%rdi), %xmm0
7783 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7784 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
7785 ; SSE-NEXT: movdqa %xmm0, %xmm9
7786 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
7787 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
7788 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0]
7789 ; SSE-NEXT: movdqa %xmm6, %xmm12
7790 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
7791 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7792 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
7793 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7794 ; SSE-NEXT: movdqa %xmm8, %xmm0
7795 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7796 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
7797 ; SSE-NEXT: movdqa %xmm9, %xmm7
7798 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7799 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7800 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7801 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7802 ; SSE-NEXT: movdqa 720(%rdi), %xmm0
7803 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7804 ; SSE-NEXT: movdqa 704(%rdi), %xmm1
7805 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7806 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7807 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7808 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
7809 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
7810 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7811 ; SSE-NEXT: movdqa 688(%rdi), %xmm2
7812 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7813 ; SSE-NEXT: movdqa 672(%rdi), %xmm3
7814 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7815 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7816 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7817 ; SSE-NEXT: movdqa 656(%rdi), %xmm2
7818 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7819 ; SSE-NEXT: movdqa 640(%rdi), %xmm0
7820 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7821 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7822 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7823 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7824 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7825 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7826 ; SSE-NEXT: movdqa 624(%rdi), %xmm0
7827 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7828 ; SSE-NEXT: movdqa 608(%rdi), %xmm2
7829 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7830 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7831 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7832 ; SSE-NEXT: movdqa 592(%rdi), %xmm0
7833 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7834 ; SSE-NEXT: movdqa 576(%rdi), %xmm1
7835 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7836 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7837 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7838 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
7839 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
7840 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7841 ; SSE-NEXT: movdqa 560(%rdi), %xmm2
7842 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7843 ; SSE-NEXT: movdqa 544(%rdi), %xmm3
7844 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7845 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7846 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7847 ; SSE-NEXT: movdqa 528(%rdi), %xmm2
7848 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7849 ; SSE-NEXT: movdqa 512(%rdi), %xmm0
7850 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7851 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7852 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7853 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7854 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7855 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7856 ; SSE-NEXT: movdqa 496(%rdi), %xmm0
7857 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7858 ; SSE-NEXT: movdqa 480(%rdi), %xmm1
7859 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7860 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7861 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7862 ; SSE-NEXT: movdqa 464(%rdi), %xmm0
7863 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7864 ; SSE-NEXT: movdqa 448(%rdi), %xmm10
7865 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7866 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
7867 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7868 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
7869 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
7870 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7871 ; SSE-NEXT: movdqa 432(%rdi), %xmm2
7872 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7873 ; SSE-NEXT: movdqa 416(%rdi), %xmm3
7874 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7875 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7876 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7877 ; SSE-NEXT: movdqa 400(%rdi), %xmm2
7878 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7879 ; SSE-NEXT: movdqa 384(%rdi), %xmm0
7880 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7881 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7882 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7883 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7884 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7885 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7886 ; SSE-NEXT: movdqa 1008(%rdi), %xmm0
7887 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7888 ; SSE-NEXT: movdqa 992(%rdi), %xmm1
7889 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7890 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7891 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7892 ; SSE-NEXT: movdqa 976(%rdi), %xmm0
7893 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7894 ; SSE-NEXT: movdqa 960(%rdi), %xmm13
7895 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7896 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
7897 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7898 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
7899 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0]
7900 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7901 ; SSE-NEXT: movdqa 944(%rdi), %xmm2
7902 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7903 ; SSE-NEXT: movdqa 928(%rdi), %xmm3
7904 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill
7905 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7906 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7907 ; SSE-NEXT: movdqa 912(%rdi), %xmm2
7908 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7909 ; SSE-NEXT: movdqa 896(%rdi), %xmm0
7910 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7911 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7912 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7913 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7914 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7915 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7916 ; SSE-NEXT: movdqa 368(%rdi), %xmm0
7917 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7918 ; SSE-NEXT: movdqa 352(%rdi), %xmm1
7919 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7920 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7921 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7922 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
7923 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7924 ; SSE-NEXT: movdqa 320(%rdi), %xmm10
7925 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7926 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
7927 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7928 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
7929 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
7930 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7931 ; SSE-NEXT: movdqa 304(%rdi), %xmm2
7932 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7933 ; SSE-NEXT: movdqa 288(%rdi), %xmm3
7934 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7935 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7936 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7937 ; SSE-NEXT: movdqa 272(%rdi), %xmm0
7938 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7939 ; SSE-NEXT: movdqa 256(%rdi), %xmm14
7940 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7941 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
7942 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7943 ; SSE-NEXT: movdqa %xmm14, %xmm0
7944 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7945 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7946 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7947 ; SSE-NEXT: movdqa 880(%rdi), %xmm0
7948 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7949 ; SSE-NEXT: movdqa 864(%rdi), %xmm1
7950 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7951 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7952 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7953 ; SSE-NEXT: movdqa 848(%rdi), %xmm0
7954 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7955 ; SSE-NEXT: movdqa 832(%rdi), %xmm9
7956 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7957 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
7958 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7959 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
7960 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0]
7961 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7962 ; SSE-NEXT: movdqa 816(%rdi), %xmm2
7963 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7964 ; SSE-NEXT: movdqa 800(%rdi), %xmm3
7965 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7966 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7967 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7968 ; SSE-NEXT: movdqa 784(%rdi), %xmm0
7969 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7970 ; SSE-NEXT: movdqa 768(%rdi), %xmm15
7971 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7972 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
7973 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7974 ; SSE-NEXT: movdqa %xmm15, %xmm0
7975 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7976 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7977 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7978 ; SSE-NEXT: movdqa 112(%rdi), %xmm0
7979 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7980 ; SSE-NEXT: movdqa 96(%rdi), %xmm13
7981 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7982 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
7983 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7984 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
7985 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7986 ; SSE-NEXT: movdqa 64(%rdi), %xmm5
7987 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7988 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
7989 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7990 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0]
7991 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0]
7992 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
7993 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
7994 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7995 ; SSE-NEXT: movdqa 48(%rdi), %xmm1
7996 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7997 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7998 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7999 ; SSE-NEXT: movdqa (%rdi), %xmm6
8000 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8001 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
8002 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8003 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
8004 ; SSE-NEXT: movdqa %xmm6, %xmm1
8005 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8006 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8007 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8008 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8009 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
8010 ; SSE-NEXT: movdqa %xmm7, %xmm14
8011 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
8012 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8013 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8014 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8015 ; SSE-NEXT: movdqa %xmm7, %xmm0
8016 ; SSE-NEXT: movdqa %xmm12, %xmm4
8017 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8018 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
8019 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8020 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8021 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8022 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
8023 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8024 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
8025 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8026 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8027 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8028 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
8029 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8030 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8031 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8032 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
8033 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8034 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
8035 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8036 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8037 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8038 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
8039 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8040 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8041 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8042 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8043 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8044 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8045 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8046 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8047 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8048 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8049 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8050 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8051 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8052 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
8053 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8054 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
8055 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8056 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8057 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8058 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8059 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8060 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8061 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8062 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8063 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8064 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8065 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8066 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8067 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8068 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8069 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8070 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8071 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8072 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8073 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8074 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8075 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8076 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8077 ; SSE-NEXT: movaps %xmm5, %xmm0
8078 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8079 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8080 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8081 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8082 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
8083 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8084 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
8085 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8086 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8087 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8088 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8089 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8090 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8091 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
8092 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2]
8093 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8094 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8095 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
8096 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8097 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8098 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
8099 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8100 ; SSE-NEXT: # xmm1 = mem[2,2,2,2]
8101 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8102 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
8103 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1]
8104 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8105 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
8106 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8107 ; SSE-NEXT: # xmm1 = mem[2,2,2,2]
8108 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8109 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3]
8110 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1]
8111 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8112 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8113 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
8114 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8115 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2]
8116 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8117 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8118 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8119 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
8120 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8121 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8122 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
8123 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8124 ; SSE-NEXT: # xmm1 = mem[2,2,2,2]
8125 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8126 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8127 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1]
8128 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8129 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8130 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
8131 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8132 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2]
8133 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8134 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8135 ; SSE-NEXT: movapd %xmm4, %xmm0
8136 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8137 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
8138 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8139 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8140 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8141 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2]
8142 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2]
8143 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8144 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8145 ; SSE-NEXT: movapd %xmm7, %xmm0
8146 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8147 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
8148 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8149 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8150 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8151 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2]
8152 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8153 ; SSE-NEXT: # xmm0 = mem[2,2,2,2]
8154 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
8155 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8156 ; SSE-NEXT: movdqa %xmm5, %xmm1
8157 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
8158 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8159 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8160 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8161 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8162 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8163 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8164 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8165 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8166 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8167 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8168 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8169 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8170 ; SSE-NEXT: movdqa %xmm15, %xmm2
8171 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
8172 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
8173 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
8174 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8175 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8176 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8177 ; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3]
8178 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8179 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8180 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3]
8181 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8182 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
8183 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8184 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8185 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8186 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8187 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8188 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8189 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8190 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8191 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8192 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8193 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8194 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8195 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8196 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
8197 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8198 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8199 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8200 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8201 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8202 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
8203 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8204 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8205 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3]
8206 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3]
8207 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
8208 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8209 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8210 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8211 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8212 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8213 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8214 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8215 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8216 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8217 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8218 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8219 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8220 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8221 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8222 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
8223 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
8224 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
8225 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8226 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8227 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8228 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8229 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8230 ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
8231 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8232 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
8233 ; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7]
8234 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8235 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8236 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8237 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8238 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8239 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8240 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8241 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8242 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8243 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8244 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8245 ; SSE-NEXT: movdqa %xmm11, %xmm0
8246 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
8247 ; SSE-NEXT: movdqa %xmm6, %xmm13
8248 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8249 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8250 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8251 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
8252 ; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
8253 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8254 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8255 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8256 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8257 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8258 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8259 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8260 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8261 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8262 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8263 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
8264 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8265 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8266 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
8267 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
8268 ; SSE-NEXT: movdqa %xmm2, %xmm0
8269 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
8270 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
8271 ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8272 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8273 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
8274 ; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
8275 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8276 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8277 ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
8278 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8279 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8280 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8281 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8282 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8283 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8284 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8285 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8286 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8287 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8288 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8289 ; SSE-NEXT: movdqa %xmm6, %xmm0
8290 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
8291 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8292 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8293 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8294 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8295 ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
8296 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8297 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8298 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8299 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8300 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8301 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8302 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8303 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8304 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8305 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8306 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
8307 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8308 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8309 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0]
8310 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
8311 ; SSE-NEXT: movdqa %xmm2, %xmm0
8312 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
8313 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1]
8314 ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8315 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8316 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8317 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
8318 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8319 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8320 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
8321 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8322 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8323 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8324 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8325 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8326 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8327 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8328 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8329 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8330 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,0,0]
8331 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
8332 ; SSE-NEXT: movdqa %xmm5, %xmm0
8333 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8334 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
8335 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8336 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
8337 ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8338 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8339 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8340 ; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
8341 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8342 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8343 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8344 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8345 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8346 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8347 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
8348 ; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7]
8349 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8350 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8351 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
8352 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8353 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,0,0]
8354 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8355 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0]
8356 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8357 ; SSE-NEXT: movdqa %xmm2, %xmm0
8358 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
8359 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8360 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8361 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
8362 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8363 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8364 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
8365 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8366 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8367 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
8368 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8369 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8370 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8371 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8372 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8373 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8374 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8375 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8376 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8377 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8378 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8379 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8380 ; SSE-NEXT: movdqa %xmm3, %xmm0
8381 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8382 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8383 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8384 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8385 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8386 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8387 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8388 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8389 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8390 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8391 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8392 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8393 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8394 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8395 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8396 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8397 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8398 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8399 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8400 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8401 ; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
8402 ; SSE-NEXT: movdqa %xmm7, %xmm2
8403 ; SSE-NEXT: movdqa %xmm7, %xmm3
8404 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8405 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
8406 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
8407 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8408 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8409 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
8410 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8411 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
8412 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8413 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8414 ; SSE-NEXT: movaps %xmm13, %xmm0
8415 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8416 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
8417 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8418 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8419 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8420 ; SSE-NEXT: # xmm2 = mem[1,1,1,1]
8421 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8422 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
8423 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
8424 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8425 ; SSE-NEXT: movaps %xmm7, %xmm0
8426 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8427 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
8428 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
8429 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8430 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8431 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
8432 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8433 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
8434 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8435 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8436 ; SSE-NEXT: movaps %xmm10, %xmm0
8437 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8438 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8439 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8440 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8441 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8442 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8443 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8444 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
8445 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8446 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8447 ; SSE-NEXT: movaps %xmm8, %xmm0
8448 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8449 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8450 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8451 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8452 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
8453 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
8454 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8455 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8456 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8457 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8458 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8459 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8460 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8461 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8462 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8463 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
8464 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8465 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8466 ; SSE-NEXT: movdqa %xmm6, %xmm0
8467 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
8468 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8469 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8470 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8471 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8472 ; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload
8473 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8474 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8475 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8476 ; SSE-NEXT: movaps %xmm5, %xmm0
8477 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8478 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
8479 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8480 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8481 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
8482 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8483 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1]
8484 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8485 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8486 ; SSE-NEXT: movaps %xmm3, %xmm0
8487 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8488 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8489 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8490 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8491 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2]
8492 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[2,2,2,2]
8493 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3]
8494 ; SSE-NEXT: movdqa %xmm11, %xmm1
8495 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8496 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8497 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8498 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8499 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8500 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
8501 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2]
8502 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8503 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8504 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8505 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8506 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8507 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8508 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2]
8509 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2]
8510 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8511 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8512 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8513 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8514 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8515 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8516 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8517 ; SSE-NEXT: # xmm0 = mem[2,2,2,2]
8518 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2]
8519 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8520 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8521 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8522 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8523 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8524 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8525 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8526 ; SSE-NEXT: # xmm0 = mem[2,2,2,2]
8527 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
8528 ; SSE-NEXT: # xmm11 = mem[2,2,2,2]
8529 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3]
8530 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8531 ; SSE-NEXT: movapd %xmm14, %xmm0
8532 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8533 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8534 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1]
8535 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8536 ; SSE-NEXT: # xmm0 = mem[2,2,2,2]
8537 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,2,2,2]
8538 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
8539 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8540 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
8541 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1]
8542 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
8543 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2]
8544 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
8545 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8546 ; SSE-NEXT: unpckhps (%rsp), %xmm0 # 16-byte Folded Reload
8547 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8548 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
8549 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
8550 ; SSE-NEXT: movaps %xmm2, %xmm15
8551 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2]
8552 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8553 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8554 ; SSE-NEXT: movdqa %xmm13, %xmm7
8555 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
8556 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3]
8557 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8558 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8559 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8560 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8561 ; SSE-NEXT: # xmm8 = mem[3,3,3,3]
8562 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8563 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8564 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
8565 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3]
8566 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8567 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8568 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8569 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8570 ; SSE-NEXT: # xmm6 = mem[3,3,3,3]
8571 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8572 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8573 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
8574 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
8575 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8576 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8577 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8578 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8579 ; SSE-NEXT: # xmm5 = mem[3,3,3,3]
8580 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8581 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8582 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
8583 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3]
8584 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8585 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8586 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8587 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8588 ; SSE-NEXT: # xmm4 = mem[3,3,3,3]
8589 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8590 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8591 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
8592 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3]
8593 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8594 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8595 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8596 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[3,3,3,3]
8597 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8598 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8599 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
8600 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
8601 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8602 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8603 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8604 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8605 ; SSE-NEXT: # xmm2 = mem[3,3,3,3]
8606 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8607 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8608 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
8609 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
8610 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8611 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
8612 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
8613 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8614 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8615 ; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
8616 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8617 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8618 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
8619 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8620 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm15[2],xmm12[3],xmm15[3]
8621 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3]
8622 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
8623 ; SSE-NEXT: # xmm15 = mem[3,3,3,3]
8624 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
8625 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
8626 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8627 ; SSE-NEXT: movaps %xmm15, 96(%rsi)
8628 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8629 ; SSE-NEXT: movaps %xmm15, 32(%rsi)
8630 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8631 ; SSE-NEXT: movaps %xmm15, 112(%rsi)
8632 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8633 ; SSE-NEXT: movaps %xmm15, 48(%rsi)
8634 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8635 ; SSE-NEXT: movaps %xmm15, 64(%rsi)
8636 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8637 ; SSE-NEXT: movaps %xmm15, (%rsi)
8638 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8639 ; SSE-NEXT: movaps %xmm15, 80(%rsi)
8640 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8641 ; SSE-NEXT: movaps %xmm15, 16(%rsi)
8642 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8643 ; SSE-NEXT: movaps %xmm15, 96(%rdx)
8644 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8645 ; SSE-NEXT: movaps %xmm15, 32(%rdx)
8646 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8647 ; SSE-NEXT: movaps %xmm15, 112(%rdx)
8648 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8649 ; SSE-NEXT: movaps %xmm15, 48(%rdx)
8650 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8651 ; SSE-NEXT: movaps %xmm15, 64(%rdx)
8652 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8653 ; SSE-NEXT: movaps %xmm15, (%rdx)
8654 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8655 ; SSE-NEXT: movaps %xmm15, 80(%rdx)
8656 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8657 ; SSE-NEXT: movaps %xmm15, 16(%rdx)
8658 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8659 ; SSE-NEXT: movaps %xmm15, 96(%rcx)
8660 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8661 ; SSE-NEXT: movaps %xmm15, 32(%rcx)
8662 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8663 ; SSE-NEXT: movaps %xmm15, 112(%rcx)
8664 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8665 ; SSE-NEXT: movaps %xmm15, 48(%rcx)
8666 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8667 ; SSE-NEXT: movaps %xmm15, 64(%rcx)
8668 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8669 ; SSE-NEXT: movaps %xmm15, (%rcx)
8670 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8671 ; SSE-NEXT: movaps %xmm15, 80(%rcx)
8672 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8673 ; SSE-NEXT: movaps %xmm15, 16(%rcx)
8674 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8675 ; SSE-NEXT: movaps %xmm15, 112(%r8)
8676 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8677 ; SSE-NEXT: movaps %xmm15, 96(%r8)
8678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8679 ; SSE-NEXT: movaps %xmm15, 80(%r8)
8680 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8681 ; SSE-NEXT: movaps %xmm15, 64(%r8)
8682 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8683 ; SSE-NEXT: movaps %xmm15, 48(%r8)
8684 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8685 ; SSE-NEXT: movaps %xmm15, 32(%r8)
8686 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8687 ; SSE-NEXT: movaps %xmm15, 16(%r8)
8688 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8689 ; SSE-NEXT: movaps %xmm15, (%r8)
8690 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8691 ; SSE-NEXT: movaps %xmm15, 112(%r9)
8692 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8693 ; SSE-NEXT: movaps %xmm15, 96(%r9)
8694 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8695 ; SSE-NEXT: movaps %xmm15, 80(%r9)
8696 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8697 ; SSE-NEXT: movaps %xmm15, 64(%r9)
8698 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8699 ; SSE-NEXT: movaps %xmm15, 48(%r9)
8700 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8701 ; SSE-NEXT: movaps %xmm15, 32(%r9)
8702 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8703 ; SSE-NEXT: movaps %xmm15, 16(%r9)
8704 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8705 ; SSE-NEXT: movaps %xmm15, (%r9)
8706 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
8707 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8708 ; SSE-NEXT: movaps %xmm12, 112(%rax)
8709 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8710 ; SSE-NEXT: movaps %xmm12, 96(%rax)
8711 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8712 ; SSE-NEXT: movaps %xmm12, 80(%rax)
8713 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8714 ; SSE-NEXT: movaps %xmm12, 64(%rax)
8715 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8716 ; SSE-NEXT: movaps %xmm12, 48(%rax)
8717 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8718 ; SSE-NEXT: movaps %xmm12, 32(%rax)
8719 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8720 ; SSE-NEXT: movaps %xmm15, 16(%rax)
8721 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8722 ; SSE-NEXT: movaps %xmm12, (%rax)
8723 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
8724 ; SSE-NEXT: movapd %xmm9, 112(%rax)
8725 ; SSE-NEXT: movapd %xmm10, 96(%rax)
8726 ; SSE-NEXT: movapd %xmm11, 80(%rax)
8727 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8728 ; SSE-NEXT: movaps %xmm9, 64(%rax)
8729 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8730 ; SSE-NEXT: movaps %xmm9, 48(%rax)
8731 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8732 ; SSE-NEXT: movaps %xmm9, 32(%rax)
8733 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8734 ; SSE-NEXT: movaps %xmm9, 16(%rax)
8735 ; SSE-NEXT: movaps %xmm7, (%rax)
8736 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
8737 ; SSE-NEXT: movaps %xmm1, 112(%rax)
8738 ; SSE-NEXT: movaps %xmm2, 96(%rax)
8739 ; SSE-NEXT: movaps %xmm3, 80(%rax)
8740 ; SSE-NEXT: movaps %xmm4, 64(%rax)
8741 ; SSE-NEXT: movaps %xmm5, 48(%rax)
8742 ; SSE-NEXT: movaps %xmm6, 32(%rax)
8743 ; SSE-NEXT: movaps %xmm8, 16(%rax)
8744 ; SSE-NEXT: movaps %xmm0, (%rax)
8745 ; SSE-NEXT: addq $1800, %rsp # imm = 0x708
8746 ; SSE-NEXT: retq
8747 ;
8748 ; AVX-LABEL: load_i16_stride8_vf64:
8749 ; AVX: # %bb.0:
8750 ; AVX-NEXT: subq $2056, %rsp # imm = 0x808
8751 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm0
8752 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8753 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm1
8754 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8755 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8756 ; AVX-NEXT: vmovdqa 272(%rdi), %xmm0
8757 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8758 ; AVX-NEXT: vmovdqa 256(%rdi), %xmm1
8759 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8760 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8761 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm14[0],xmm11[1],xmm14[1]
8762 ; AVX-NEXT: vmovdqa 368(%rdi), %xmm1
8763 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8764 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm2
8765 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8766 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8767 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8768 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8769 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm2
8770 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8771 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm3
8772 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8773 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8774 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,0,1]
8775 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
8776 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
8777 ; AVX-NEXT: vmovdqa 496(%rdi), %xmm1
8778 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8779 ; AVX-NEXT: vmovdqa 480(%rdi), %xmm2
8780 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8781 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8782 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
8783 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8784 ; AVX-NEXT: vmovdqa 464(%rdi), %xmm2
8785 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8786 ; AVX-NEXT: vmovdqa 448(%rdi), %xmm3
8787 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8788 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8789 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,1]
8790 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
8791 ; AVX-NEXT: vmovdqa 432(%rdi), %xmm2
8792 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8793 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm3
8794 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8795 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8796 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8797 ; AVX-NEXT: vmovdqa 400(%rdi), %xmm2
8798 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8799 ; AVX-NEXT: vmovdqa 384(%rdi), %xmm3
8800 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8801 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8802 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8803 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
8804 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
8805 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8806 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
8807 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8808 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8809 ; AVX-NEXT: vmovdqa 880(%rdi), %xmm0
8810 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8811 ; AVX-NEXT: vmovdqa 864(%rdi), %xmm1
8812 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8813 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8814 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8815 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8816 ; AVX-NEXT: vmovdqa 848(%rdi), %xmm1
8817 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8818 ; AVX-NEXT: vmovdqa 832(%rdi), %xmm2
8819 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8820 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8821 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8822 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
8823 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
8824 ; AVX-NEXT: vmovdqa 816(%rdi), %xmm1
8825 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8826 ; AVX-NEXT: vmovdqa 800(%rdi), %xmm2
8827 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8828 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8829 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8830 ; AVX-NEXT: vmovdqa 784(%rdi), %xmm1
8831 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8832 ; AVX-NEXT: vmovdqa 768(%rdi), %xmm2
8833 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8834 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8835 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
8836 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
8837 ; AVX-NEXT: vmovdqa 1008(%rdi), %xmm1
8838 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8839 ; AVX-NEXT: vmovdqa 992(%rdi), %xmm2
8840 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8841 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8842 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
8843 ; AVX-NEXT: vmovdqa 976(%rdi), %xmm2
8844 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8845 ; AVX-NEXT: vmovdqa 960(%rdi), %xmm3
8846 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8847 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8848 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8849 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
8850 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
8851 ; AVX-NEXT: vmovdqa 944(%rdi), %xmm2
8852 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8853 ; AVX-NEXT: vmovdqa 928(%rdi), %xmm3
8854 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8855 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8856 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8857 ; AVX-NEXT: vmovdqa 912(%rdi), %xmm2
8858 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8859 ; AVX-NEXT: vmovdqa 896(%rdi), %xmm3
8860 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8861 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8862 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8863 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
8864 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
8865 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8866 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
8867 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8868 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8869 ; AVX-NEXT: vmovdqa 624(%rdi), %xmm0
8870 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8871 ; AVX-NEXT: vmovdqa 608(%rdi), %xmm1
8872 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8873 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8874 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8875 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8876 ; AVX-NEXT: vmovdqa 592(%rdi), %xmm1
8877 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8878 ; AVX-NEXT: vmovdqa 576(%rdi), %xmm2
8879 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8880 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8881 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8882 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
8883 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
8884 ; AVX-NEXT: vmovdqa 560(%rdi), %xmm1
8885 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8886 ; AVX-NEXT: vmovdqa 544(%rdi), %xmm2
8887 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8888 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8889 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8890 ; AVX-NEXT: vmovdqa 528(%rdi), %xmm1
8891 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8892 ; AVX-NEXT: vmovdqa 512(%rdi), %xmm2
8893 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8894 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8895 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8896 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
8897 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
8898 ; AVX-NEXT: vmovdqa 752(%rdi), %xmm1
8899 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8900 ; AVX-NEXT: vmovdqa 736(%rdi), %xmm2
8901 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8902 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8903 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8904 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8905 ; AVX-NEXT: vmovdqa 720(%rdi), %xmm2
8906 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8907 ; AVX-NEXT: vmovdqa 704(%rdi), %xmm3
8908 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8909 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8910 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8911 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
8912 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
8913 ; AVX-NEXT: vmovdqa 688(%rdi), %xmm2
8914 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8915 ; AVX-NEXT: vmovdqa 672(%rdi), %xmm3
8916 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8917 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8918 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8919 ; AVX-NEXT: vmovdqa 656(%rdi), %xmm2
8920 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8921 ; AVX-NEXT: vmovdqa 640(%rdi), %xmm3
8922 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8923 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8924 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8925 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
8926 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
8927 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8928 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
8929 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8930 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8931 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
8932 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8933 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm1
8934 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8935 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8936 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8937 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8938 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm1
8939 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8940 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm2
8941 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8942 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8943 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
8944 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8945 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
8946 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm1
8947 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8948 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm2
8949 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8950 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8951 ; AVX-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
8952 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm1
8953 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8954 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm2
8955 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8956 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8957 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8958 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
8959 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8960 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
8961 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
8962 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm0
8963 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8964 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm1
8965 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8966 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8967 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8968 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
8969 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm0
8970 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8971 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
8972 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8973 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8974 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8975 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
8976 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7]
8977 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
8978 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8979 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
8980 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8981 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm6
8982 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8983 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm7
8984 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8985 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
8986 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8987 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8988 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
8989 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8990 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7]
8991 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
8992 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8993 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8994 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
8995 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8996 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7]
8997 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8998 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8999 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
9000 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
9001 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9002 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
9003 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9004 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9005 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1]
9006 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9007 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm9[1],xmm15[2,3]
9008 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9009 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
9010 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9011 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9012 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9013 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
9014 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9015 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7]
9016 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9017 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9018 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
9019 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
9020 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9021 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9022 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
9023 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9024 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9025 ; AVX-NEXT: # xmm15 = mem[1,1,1,1]
9026 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
9027 ; AVX-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3]
9028 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9029 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
9030 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9031 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9032 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9033 ; AVX-NEXT: # xmm0 = mem[1,1,1,1]
9034 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9035 ; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
9036 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9037 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9038 ; AVX-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1]
9039 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
9040 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9041 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9042 ; AVX-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1]
9043 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9044 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9045 ; AVX-NEXT: # xmm15 = mem[1,1,1,1]
9046 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
9047 ; AVX-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3]
9048 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9049 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
9050 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9051 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9052 ; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload
9053 ; AVX-NEXT: # xmm0 = xmm4[0],mem[0],xmm4[1],mem[1]
9054 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9055 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9056 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1,1,1]
9057 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
9058 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
9059 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9060 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
9061 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
9062 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9063 ; AVX-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7]
9064 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9065 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9066 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1]
9067 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7]
9068 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
9069 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9070 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
9071 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5],xmm0[6,7]
9072 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm14[2],xmm11[3],xmm14[3]
9073 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
9074 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
9075 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9076 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9077 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2]
9078 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9079 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1,2],xmm15[3]
9080 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9081 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
9082 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9083 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9084 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
9085 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7]
9086 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9087 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
9088 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
9089 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9090 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9091 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3]
9092 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9093 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[2,2,2,2]
9094 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
9095 ; AVX-NEXT: # xmm15 = mem[0,1,2,3,4,5],xmm15[6,7]
9096 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9097 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
9098 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9099 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9100 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9101 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2]
9102 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9103 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
9104 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9105 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9106 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
9107 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
9108 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9109 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9110 ; AVX-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
9111 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9112 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9113 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,2,2]
9114 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9115 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3]
9116 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9117 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
9118 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9119 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9120 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
9121 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9122 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9123 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
9124 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9125 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3]
9126 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9127 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
9128 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2]
9129 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9130 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9131 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9132 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
9133 ; AVX-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
9134 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7]
9135 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
9136 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9137 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9138 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9139 ; AVX-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9140 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9141 ; AVX-NEXT: # xmm3 = mem[2,3,2,3]
9142 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9143 ; AVX-NEXT: # xmm15 = mem[3,3,3,3]
9144 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3]
9145 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
9146 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm9[2],xmm14[3],xmm9[3]
9147 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9148 ; AVX-NEXT: # xmm15 = mem[2,3,2,3]
9149 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9150 ; AVX-NEXT: # xmm14 = mem[3,3,3,3]
9151 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3]
9152 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9153 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9154 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7]
9155 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9156 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9157 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9158 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9159 ; AVX-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9160 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9161 ; AVX-NEXT: # xmm3 = mem[2,3,2,3]
9162 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9163 ; AVX-NEXT: # xmm14 = mem[3,3,3,3]
9164 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3]
9165 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
9166 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9167 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9168 ; AVX-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
9169 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm11[2,3,2,3]
9170 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3]
9171 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
9172 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9173 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9174 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7]
9175 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9176 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9177 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
9178 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,3,2,3]
9179 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm10[3,3,3,3]
9180 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3]
9181 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
9182 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm12[2],xmm8[3],xmm12[3]
9183 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
9184 ; AVX-NEXT: # xmm11 = mem[2,3,2,3]
9185 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
9186 ; AVX-NEXT: # xmm10 = mem[3,3,3,3]
9187 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3]
9188 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9189 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
9190 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
9191 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9192 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9193 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
9194 ; AVX-NEXT: vpermilps $238, (%rsp), %xmm3 # 16-byte Folded Reload
9195 ; AVX-NEXT: # xmm3 = mem[2,3,2,3]
9196 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9197 ; AVX-NEXT: # xmm6 = mem[3,3,3,3]
9198 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
9199 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9200 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9201 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
9202 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
9203 ; AVX-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3]
9204 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9205 ; AVX-NEXT: # xmm2 = mem[2,3,2,3]
9206 ; AVX-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9207 ; AVX-NEXT: # xmm1 = mem[3,3,3,3]
9208 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
9209 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
9210 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9211 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9212 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9213 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
9214 ; AVX-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9215 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9216 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
9217 ; AVX-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9218 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9219 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
9220 ; AVX-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9221 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9222 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
9223 ; AVX-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9224 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0]
9225 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
9226 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
9227 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
9228 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9229 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9230 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9231 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9232 ; AVX-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9233 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9234 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9235 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload
9236 ; AVX-NEXT: # xmm15 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9237 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9238 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
9239 ; AVX-NEXT: # xmm11 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9240 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9241 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
9242 ; AVX-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9243 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9244 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,0,0,0]
9245 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1]
9246 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
9247 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm3[0],xmm15[1],xmm3[1]
9248 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9249 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9250 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
9251 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9252 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9253 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9254 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9255 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9256 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9257 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9258 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
9259 ; AVX-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9260 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9261 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
9262 ; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9263 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9264 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9265 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
9266 ; AVX-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9267 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9268 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
9269 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
9270 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
9271 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
9272 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9273 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9274 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9275 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9276 ; AVX-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9277 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9278 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9279 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
9280 ; AVX-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9281 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9282 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9283 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
9284 ; AVX-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9285 ; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
9286 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9287 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
9288 ; AVX-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9289 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9290 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
9291 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1]
9292 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
9293 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
9294 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9295 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9296 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
9297 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9298 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9299 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9300 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9301 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9302 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9303 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9304 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
9305 ; AVX-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9306 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9307 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9308 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
9309 ; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9310 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9311 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9312 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
9313 ; AVX-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9314 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9315 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
9316 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
9317 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
9318 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
9319 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9320 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9321 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
9322 ; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9323 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9324 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9325 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
9326 ; AVX-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9327 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9328 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9329 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
9330 ; AVX-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9331 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9332 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9333 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9334 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9335 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,0,0,0]
9336 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9337 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1]
9338 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7]
9339 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
9340 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
9341 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9342 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
9343 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9344 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9345 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9346 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
9347 ; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9348 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9349 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9350 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
9351 ; AVX-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9352 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9353 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9354 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9355 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9356 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9357 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9358 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9359 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9360 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
9361 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
9362 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7]
9363 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
9364 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9365 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9366 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9367 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6,7]
9368 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9369 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9370 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9371 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9372 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9373 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9374 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9375 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9376 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
9377 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
9378 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm2[6,7]
9379 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9380 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9381 ; AVX-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9382 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9383 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9384 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9385 ; AVX-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
9386 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9387 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
9388 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4,5,6,7]
9389 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9390 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9391 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9392 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
9393 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7]
9394 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9395 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9396 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
9397 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
9398 ; AVX-NEXT: vmovdqa %xmm11, %xmm8
9399 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9400 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9401 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
9402 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9403 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9404 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,1,1]
9405 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9406 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7]
9407 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9408 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
9409 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9410 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9411 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
9412 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9413 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
9414 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9415 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9416 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
9417 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
9418 ; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
9419 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9420 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
9421 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9422 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9423 ; AVX-NEXT: # xmm14 = mem[1,1,1,1]
9424 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9425 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm2[1],xmm14[2,3]
9426 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9427 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
9428 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9429 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9430 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9431 ; AVX-NEXT: # xmm0 = mem[1,1,1,1]
9432 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9433 ; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
9434 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9435 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9436 ; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
9437 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
9438 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9439 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
9440 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9441 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9442 ; AVX-NEXT: # xmm14 = mem[1,1,1,1]
9443 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9444 ; AVX-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3]
9445 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9446 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
9447 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9448 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9449 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9450 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9451 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
9452 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9453 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
9454 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9455 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3],xmm1[4,5,6,7]
9456 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9457 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
9458 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9459 ; AVX-NEXT: # xmm1 = mem[1,1,1,1]
9460 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9461 ; AVX-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3]
9462 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9463 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9464 ; AVX-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1]
9465 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
9466 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9467 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9468 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[2,2,2,2]
9469 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7]
9470 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
9471 ; AVX-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3]
9472 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9473 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm3[2],xmm15[3],xmm3[3]
9474 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9475 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,2,2]
9476 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5],xmm14[6,7]
9477 ; AVX-NEXT: vmovdqa %xmm11, %xmm15
9478 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9479 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7]
9480 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9481 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9482 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
9483 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7]
9484 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9485 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9486 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9487 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9488 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9489 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9490 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9491 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2]
9492 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9493 ; AVX-NEXT: # xmm14 = mem[0,1,2],xmm14[3]
9494 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9495 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7]
9496 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9497 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9498 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9499 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2]
9500 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9501 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
9502 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9503 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9504 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
9505 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9506 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9507 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9508 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
9509 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9510 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9511 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2]
9512 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9513 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3]
9514 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9515 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7]
9516 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9517 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9518 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9519 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
9520 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9521 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2]
9522 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9523 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3]
9524 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9525 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
9526 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9527 ; AVX-NEXT: # xmm1 = mem[2,2,2,2]
9528 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9529 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3]
9530 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9531 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9532 ; AVX-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
9533 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
9534 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
9535 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9536 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9537 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9538 ; AVX-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9539 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9540 ; AVX-NEXT: # xmm1 = mem[2,3,2,3]
9541 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9542 ; AVX-NEXT: # xmm14 = mem[3,3,3,3]
9543 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3]
9544 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9545 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
9546 ; AVX-NEXT: # xmm1 = xmm15[2],mem[2],xmm15[3],mem[3]
9547 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9548 ; AVX-NEXT: # xmm14 = mem[2,3,2,3]
9549 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9550 ; AVX-NEXT: # xmm15 = mem[3,3,3,3]
9551 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
9552 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9553 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9554 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
9555 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9556 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
9557 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
9558 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3]
9559 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3]
9560 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9561 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
9562 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3]
9563 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3,3,3]
9564 ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
9565 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9566 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
9567 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
9568 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9569 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9570 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9571 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9572 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9573 ; AVX-NEXT: # xmm4 = mem[2,3,2,3]
9574 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9575 ; AVX-NEXT: # xmm6 = mem[3,3,3,3]
9576 ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
9577 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
9578 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9579 ; AVX-NEXT: vunpckhps (%rsp), %xmm2, %xmm4 # 16-byte Folded Reload
9580 ; AVX-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3]
9581 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9582 ; AVX-NEXT: # xmm6 = mem[2,3,2,3]
9583 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9584 ; AVX-NEXT: # xmm7 = mem[3,3,3,3]
9585 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
9586 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
9587 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
9588 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
9589 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
9590 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload
9591 ; AVX-NEXT: # xmm4 = xmm11[2],mem[2],xmm11[3],mem[3]
9592 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9593 ; AVX-NEXT: # xmm6 = mem[2,3,2,3]
9594 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9595 ; AVX-NEXT: # xmm7 = mem[3,3,3,3]
9596 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
9597 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
9598 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
9599 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
9600 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload
9601 ; AVX-NEXT: # xmm5 = xmm13[2],mem[2],xmm13[3],mem[3]
9602 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3]
9603 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9604 ; AVX-NEXT: # xmm3 = mem[3,3,3,3]
9605 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
9606 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
9607 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
9608 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9609 ; AVX-NEXT: vmovaps %ymm3, 64(%rsi)
9610 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9611 ; AVX-NEXT: vmovaps %ymm3, (%rsi)
9612 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9613 ; AVX-NEXT: vmovaps %ymm3, 96(%rsi)
9614 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9615 ; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
9616 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9617 ; AVX-NEXT: vmovaps %ymm3, 64(%rdx)
9618 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9619 ; AVX-NEXT: vmovaps %ymm3, (%rdx)
9620 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9621 ; AVX-NEXT: vmovaps %ymm3, 96(%rdx)
9622 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9623 ; AVX-NEXT: vmovaps %ymm3, 32(%rdx)
9624 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9625 ; AVX-NEXT: vmovaps %ymm3, 64(%rcx)
9626 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9627 ; AVX-NEXT: vmovaps %ymm3, (%rcx)
9628 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9629 ; AVX-NEXT: vmovaps %ymm3, 96(%rcx)
9630 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9631 ; AVX-NEXT: vmovaps %ymm3, 32(%rcx)
9632 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9633 ; AVX-NEXT: vmovaps %ymm3, 64(%r8)
9634 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9635 ; AVX-NEXT: vmovaps %ymm3, (%r8)
9636 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9637 ; AVX-NEXT: vmovaps %ymm3, 96(%r8)
9638 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9639 ; AVX-NEXT: vmovaps %ymm3, 32(%r8)
9640 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9641 ; AVX-NEXT: vmovaps %ymm3, 64(%r9)
9642 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9643 ; AVX-NEXT: vmovaps %ymm3, (%r9)
9644 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9645 ; AVX-NEXT: vmovaps %ymm3, 96(%r9)
9646 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9647 ; AVX-NEXT: vmovaps %ymm3, 32(%r9)
9648 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
9649 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9650 ; AVX-NEXT: vmovaps %ymm3, 64(%rax)
9651 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9652 ; AVX-NEXT: vmovaps %ymm3, (%rax)
9653 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9654 ; AVX-NEXT: vmovaps %ymm3, 96(%rax)
9655 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9656 ; AVX-NEXT: vmovaps %ymm3, 32(%rax)
9657 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
9658 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9659 ; AVX-NEXT: vmovaps %ymm3, 64(%rax)
9660 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9661 ; AVX-NEXT: vmovaps %ymm3, (%rax)
9662 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9663 ; AVX-NEXT: vmovaps %ymm3, 96(%rax)
9664 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9665 ; AVX-NEXT: vmovaps %ymm3, 32(%rax)
9666 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
9667 ; AVX-NEXT: vmovaps %ymm1, 96(%rax)
9668 ; AVX-NEXT: vmovaps %ymm0, 64(%rax)
9669 ; AVX-NEXT: vmovaps %ymm14, 32(%rax)
9670 ; AVX-NEXT: vmovaps %ymm2, (%rax)
9671 ; AVX-NEXT: addq $2056, %rsp # imm = 0x808
9672 ; AVX-NEXT: vzeroupper
9673 ; AVX-NEXT: retq
9674 ;
9675 ; AVX2-LABEL: load_i16_stride8_vf64:
9676 ; AVX2: # %bb.0:
9677 ; AVX2-NEXT: subq $2408, %rsp # imm = 0x968
9678 ; AVX2-NEXT: vmovdqa 448(%rdi), %ymm2
9679 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9680 ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm3
9681 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9682 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm0
9683 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9684 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm1
9685 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9686 ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm4
9687 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9688 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm5
9689 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9690 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
9691 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9692 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9693 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9694 ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm0
9695 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9696 ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm1
9697 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9698 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9699 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9700 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
9701 ; AVX2-NEXT: vmovdqa 336(%rdi), %xmm1
9702 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9703 ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm4
9704 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9705 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
9706 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9707 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
9708 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9709 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
9710 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9711 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
9712 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9713 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
9714 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9715 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
9716 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
9717 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9718 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
9719 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9720 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
9721 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
9722 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm3
9723 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9724 ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm2
9725 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9726 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
9727 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9728 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
9729 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9730 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
9731 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
9732 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9733 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
9734 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9735 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9736 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
9737 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
9738 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9739 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9740 ; AVX2-NEXT: vmovdqa 880(%rdi), %xmm0
9741 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9742 ; AVX2-NEXT: vmovdqa 864(%rdi), %xmm1
9743 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9744 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9745 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9746 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
9747 ; AVX2-NEXT: vmovdqa 848(%rdi), %xmm1
9748 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9749 ; AVX2-NEXT: vmovdqa 832(%rdi), %xmm2
9750 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9751 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9752 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9753 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
9754 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9755 ; AVX2-NEXT: vmovdqa 784(%rdi), %xmm1
9756 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9757 ; AVX2-NEXT: vmovdqa 768(%rdi), %xmm2
9758 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9759 ; AVX2-NEXT: vmovdqa 816(%rdi), %xmm3
9760 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9761 ; AVX2-NEXT: vmovdqa 800(%rdi), %xmm4
9762 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9763 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
9764 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9765 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9766 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9767 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
9768 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9769 ; AVX2-NEXT: vmovdqa 960(%rdi), %ymm2
9770 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9771 ; AVX2-NEXT: vmovdqa 992(%rdi), %ymm1
9772 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9773 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
9774 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9775 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
9776 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
9777 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
9778 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9779 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
9780 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
9781 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
9782 ; AVX2-NEXT: vmovdqa 896(%rdi), %ymm3
9783 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9784 ; AVX2-NEXT: vmovdqa 928(%rdi), %ymm2
9785 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9786 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
9787 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9788 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
9789 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
9790 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
9791 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9792 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7]
9793 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9794 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
9795 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
9796 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9797 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9798 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm0
9799 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9800 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm1
9801 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9802 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9803 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9804 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
9805 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm1
9806 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9807 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
9808 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9809 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9810 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9811 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
9812 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9813 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
9814 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9815 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
9816 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9817 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
9818 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9819 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm4
9820 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9821 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
9822 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9823 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9824 ; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
9825 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
9826 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
9827 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm1
9828 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9829 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm0
9830 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9831 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
9832 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9833 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
9834 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
9835 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
9836 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9837 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
9838 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
9839 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7]
9840 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0
9841 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9842 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1
9843 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9844 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
9845 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9846 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
9847 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
9848 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
9849 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9850 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
9851 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9852 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
9853 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
9854 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
9855 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9856 ; AVX2-NEXT: vmovdqa 624(%rdi), %xmm4
9857 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9858 ; AVX2-NEXT: vmovdqa 608(%rdi), %xmm5
9859 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9860 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
9861 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9862 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm4
9863 ; AVX2-NEXT: vmovdqa 592(%rdi), %xmm5
9864 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9865 ; AVX2-NEXT: vmovdqa 576(%rdi), %xmm8
9866 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9867 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
9868 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9869 ; AVX2-NEXT: vpbroadcastd %xmm5, %xmm5
9870 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9871 ; AVX2-NEXT: vmovdqa 528(%rdi), %xmm5
9872 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9873 ; AVX2-NEXT: vmovdqa 512(%rdi), %xmm8
9874 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9875 ; AVX2-NEXT: vmovdqa 560(%rdi), %xmm9
9876 ; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9877 ; AVX2-NEXT: vmovdqa 544(%rdi), %xmm12
9878 ; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9879 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
9880 ; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9881 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
9882 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9883 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
9884 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm5[0,1],xmm4[2,3]
9885 ; AVX2-NEXT: vmovdqa 704(%rdi), %ymm5
9886 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9887 ; AVX2-NEXT: vmovdqa 736(%rdi), %ymm4
9888 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9889 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,2]
9890 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9891 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
9892 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9893 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
9894 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,0,2]
9895 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9896 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
9897 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9898 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
9899 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7]
9900 ; AVX2-NEXT: vmovdqa 640(%rdi), %ymm9
9901 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9902 ; AVX2-NEXT: vmovdqa 672(%rdi), %ymm8
9903 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9904 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,2]
9905 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9906 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
9907 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9908 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
9909 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm9[0,1,0,2]
9910 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9911 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
9912 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9913 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9914 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
9915 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
9916 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
9917 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9918 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9919 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,1,1]
9920 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9921 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm8[1],xmm12[2,3]
9922 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9923 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9924 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9925 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
9926 ; AVX2-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
9927 ; AVX2-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
9928 ; AVX2-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
9929 ; AVX2-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
9930 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
9931 ; AVX2-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
9932 ; AVX2-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9933 ; AVX2-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
9934 ; AVX2-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
9935 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
9936 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
9937 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
9938 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9939 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9940 ; AVX2-NEXT: # xmm12 = mem[1,1,1,1]
9941 ; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
9942 ; AVX2-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
9943 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9944 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9945 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
9946 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
9947 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
9948 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
9949 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
9950 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9951 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
9952 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
9953 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
9954 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
9955 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9956 ; AVX2-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload
9957 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1]
9958 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9959 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3]
9960 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9961 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9962 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
9963 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
9964 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
9965 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
9966 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
9967 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9968 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
9969 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
9970 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
9971 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
9972 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9973 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9974 ; AVX2-NEXT: # xmm0 = mem[1,1,1,1]
9975 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9976 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
9977 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9978 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9979 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
9980 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
9981 ; AVX2-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
9982 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
9983 ; AVX2-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
9984 ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
9985 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
9986 ; AVX2-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
9987 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9988 ; AVX2-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9989 ; AVX2-NEXT: # ymm3 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
9990 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
9991 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
9992 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9993 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9994 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
9995 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
9996 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
9997 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9998 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
9999 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
10000 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10001 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10002 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
10003 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10004 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10005 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10006 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10007 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10008 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10009 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10010 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10011 ; AVX2-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
10012 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10013 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10014 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10015 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10016 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10017 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10018 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10019 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
10020 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
10021 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10022 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10023 ; AVX2-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
10024 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10025 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10026 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
10027 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10028 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10029 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
10030 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10031 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10032 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10033 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10034 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10035 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10036 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10037 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10038 ; AVX2-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
10039 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10040 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10041 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10042 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10043 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10044 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10045 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10046 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
10047 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
10048 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
10049 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10050 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10051 ; AVX2-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
10052 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
10053 ; AVX2-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
10054 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10055 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10056 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10057 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
10058 ; AVX2-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
10059 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
10060 ; AVX2-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
10061 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10062 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10063 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10064 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10065 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10066 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10067 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
10068 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10069 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
10070 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10071 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
10072 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
10073 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
10074 ; AVX2-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
10075 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
10076 ; AVX2-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
10077 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10078 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10079 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10080 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10081 ; AVX2-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
10082 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10083 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10084 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10085 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10086 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
10087 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10088 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
10089 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10090 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10091 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10092 ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
10093 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10094 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
10095 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10096 ; AVX2-NEXT: # xmm3 = mem[3,3,3,3]
10097 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
10098 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10099 ; AVX2-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10100 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10101 ; AVX2-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10102 ; AVX2-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10103 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
10104 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10105 ; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10106 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10107 ; AVX2-NEXT: # ymm15 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10108 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
10109 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
10110 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10111 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10112 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10113 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10114 ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
10115 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10116 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
10117 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10118 ; AVX2-NEXT: # xmm3 = mem[3,3,3,3]
10119 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
10120 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10121 ; AVX2-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10122 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10123 ; AVX2-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10124 ; AVX2-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10125 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
10126 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10127 ; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10128 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
10129 ; AVX2-NEXT: # ymm11 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10130 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7]
10131 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
10132 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10133 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10134 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10135 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10136 ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
10137 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10138 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
10139 ; AVX2-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload
10140 ; AVX2-NEXT: # xmm3 = mem[3,3,3,3]
10141 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
10142 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10143 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10144 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10145 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
10146 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10147 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10148 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
10149 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
10150 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10151 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10152 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload
10153 ; AVX2-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3]
10154 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10155 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
10156 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
10157 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
10158 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10159 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10160 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10161 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
10162 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10163 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10164 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
10165 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10166 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10167 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10168 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10169 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10170 ; AVX2-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10171 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10172 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10173 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10174 ; AVX2-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10175 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10176 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10177 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
10178 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10179 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10180 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10181 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
10182 ; AVX2-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10183 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10184 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
10185 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm1
10186 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10187 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10188 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10189 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10190 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
10191 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10192 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10193 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10194 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10195 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
10196 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10197 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10198 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
10199 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10200 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10201 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10202 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10203 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10204 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10205 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10206 ; AVX2-NEXT: # ymm3 = mem[0,1,1,3]
10207 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10208 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
10209 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10210 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10211 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
10212 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10213 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10214 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10215 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10216 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10217 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10218 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10219 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10220 ; AVX2-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10221 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10222 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10223 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10224 ; AVX2-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10225 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10226 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10227 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
10228 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10229 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10230 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10231 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
10232 ; AVX2-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10233 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10234 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
10235 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm1
10236 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10237 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10238 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10239 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10240 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
10241 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10242 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10243 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10244 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10245 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
10246 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10247 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
10248 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10249 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10250 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10251 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10252 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10253 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10254 ; AVX2-NEXT: # ymm3 = mem[0,1,1,3]
10255 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10256 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
10257 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10258 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
10259 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10260 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10261 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10262 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10263 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10264 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10265 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
10266 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10267 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10268 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10269 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10270 ; AVX2-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10271 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10272 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
10273 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm1
10274 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10275 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10276 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
10277 ; AVX2-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
10278 ; AVX2-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
10279 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10280 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10281 ; AVX2-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
10282 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10283 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
10284 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10285 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10286 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
10287 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10288 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10289 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10290 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10291 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4]
10292 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10293 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4]
10294 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10295 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10296 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10297 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10298 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10299 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10300 ; AVX2-NEXT: # ymm3 = mem[0,1,1,3]
10301 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10302 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7]
10303 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10304 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
10305 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10306 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10307 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10308 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10309 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10310 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10311 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10312 ; AVX2-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10313 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10314 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10315 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10316 ; AVX2-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10317 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10318 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10319 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
10320 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10321 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10322 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10323 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
10324 ; AVX2-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10325 ; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10326 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
10327 ; AVX2-NEXT: vpbroadcastd %xmm12, %xmm1
10328 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10329 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10330 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
10331 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
10332 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3]
10333 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10334 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10335 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
10336 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10337 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
10338 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10339 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
10340 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10341 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7]
10342 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
10343 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3]
10344 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10345 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10346 ; AVX2-NEXT: # ymm15 = mem[0,1,1,3]
10347 ; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10348 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
10349 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10350 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7]
10351 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10352 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
10353 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
10354 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
10355 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10356 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
10357 ; AVX2-NEXT: # xmm12 = mem[1,1,1,1]
10358 ; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10359 ; AVX2-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
10360 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10361 ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
10362 ; AVX2-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
10363 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
10364 ; AVX2-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
10365 ; AVX2-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10366 ; AVX2-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10367 ; AVX2-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10368 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
10369 ; AVX2-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10370 ; AVX2-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10371 ; AVX2-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10372 ; AVX2-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10373 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
10374 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
10375 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
10376 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10377 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10378 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1]
10379 ; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10380 ; AVX2-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
10381 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10382 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10383 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
10384 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
10385 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10386 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10387 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
10388 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10389 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10390 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
10391 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
10392 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
10393 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10394 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10395 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,1,1]
10396 ; AVX2-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload
10397 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3]
10398 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10399 ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm9 # 16-byte Folded Reload
10400 ; AVX2-NEXT: # xmm9 = xmm11[0],mem[0],xmm11[1],mem[1]
10401 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
10402 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10403 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10404 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
10405 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10406 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10407 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
10408 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
10409 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
10410 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10411 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10412 ; AVX2-NEXT: # xmm4 = mem[1,1,1,1]
10413 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10414 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3]
10415 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10416 ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
10417 ; AVX2-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
10418 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
10419 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10420 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10421 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
10422 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10423 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10424 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
10425 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
10426 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10427 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10428 ; AVX2-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10429 ; AVX2-NEXT: # xmm0 = mem[2,2,2,2]
10430 ; AVX2-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10431 ; AVX2-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
10432 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10433 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10434 ; AVX2-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
10435 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10436 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10437 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
10438 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10439 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10440 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
10441 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10442 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10443 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10444 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10445 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10446 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10447 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10448 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10449 ; AVX2-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
10450 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10451 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10452 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10453 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10454 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10455 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10456 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10457 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
10458 ; AVX2-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10459 ; AVX2-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
10460 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
10461 ; AVX2-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3]
10462 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10463 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10464 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
10465 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10466 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10467 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
10468 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10469 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10470 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10471 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10472 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10473 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10474 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10475 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10476 ; AVX2-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
10477 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10478 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10479 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10480 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10481 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10482 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10483 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10484 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10485 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
10486 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
10487 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
10488 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
10489 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
10490 ; AVX2-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
10491 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
10492 ; AVX2-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
10493 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10494 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10495 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm0[7]
10496 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10497 ; AVX2-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
10498 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10499 ; AVX2-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
10500 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10501 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10502 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
10503 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
10504 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7]
10505 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10506 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10507 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,2,2]
10508 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10509 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
10510 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10511 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm10[2],xmm13[2],xmm10[3],xmm13[3]
10512 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm6[2,3]
10513 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
10514 ; AVX2-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
10515 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
10516 ; AVX2-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
10517 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10518 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10519 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm6[7]
10520 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
10521 ; AVX2-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
10522 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
10523 ; AVX2-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
10524 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10525 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10526 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
10527 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10528 ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7]
10529 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10530 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
10531 ; AVX2-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload
10532 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
10533 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10534 ; AVX2-NEXT: # xmm2 = mem[3,3,3,3]
10535 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
10536 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10537 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10538 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10539 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10540 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10541 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10542 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10543 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10544 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10545 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10546 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10547 ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
10548 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10549 ; AVX2-NEXT: # xmm2 = mem[2,3,2,3]
10550 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10551 ; AVX2-NEXT: # xmm3 = mem[3,3,3,3]
10552 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
10553 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
10554 ; AVX2-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10555 ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10556 ; AVX2-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10557 ; AVX2-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10558 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
10559 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10560 ; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10561 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10562 ; AVX2-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10563 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
10564 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
10565 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
10566 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
10567 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10568 ; AVX2-NEXT: # xmm3 = mem[2,3,2,3]
10569 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,3,3,3]
10570 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3]
10571 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
10572 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10573 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10574 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
10575 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10576 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10577 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
10578 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
10579 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
10580 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10581 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
10582 ; AVX2-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
10583 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10584 ; AVX2-NEXT: # xmm4 = mem[2,3,2,3]
10585 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10586 ; AVX2-NEXT: # xmm5 = mem[3,3,3,3]
10587 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
10588 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
10589 ; AVX2-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10590 ; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10591 ; AVX2-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
10592 ; AVX2-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10593 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
10594 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
10595 ; AVX2-NEXT: # ymm5 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10596 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
10597 ; AVX2-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10598 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
10599 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
10600 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
10601 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10602 ; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
10603 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10604 ; AVX2-NEXT: vmovaps %ymm4, (%rsi)
10605 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10606 ; AVX2-NEXT: vmovaps %ymm4, 96(%rsi)
10607 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10608 ; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
10609 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10610 ; AVX2-NEXT: vmovaps %ymm4, 64(%rdx)
10611 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10612 ; AVX2-NEXT: vmovaps %ymm4, (%rdx)
10613 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10614 ; AVX2-NEXT: vmovaps %ymm4, 96(%rdx)
10615 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10616 ; AVX2-NEXT: vmovaps %ymm4, 32(%rdx)
10617 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10618 ; AVX2-NEXT: vmovaps %ymm4, 64(%rcx)
10619 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10620 ; AVX2-NEXT: vmovaps %ymm4, (%rcx)
10621 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10622 ; AVX2-NEXT: vmovaps %ymm4, 96(%rcx)
10623 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10624 ; AVX2-NEXT: vmovaps %ymm4, 32(%rcx)
10625 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10626 ; AVX2-NEXT: vmovaps %ymm4, 64(%r8)
10627 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10628 ; AVX2-NEXT: vmovaps %ymm4, (%r8)
10629 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10630 ; AVX2-NEXT: vmovaps %ymm4, 96(%r8)
10631 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10632 ; AVX2-NEXT: vmovaps %ymm4, 32(%r8)
10633 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10634 ; AVX2-NEXT: vmovaps %ymm4, 64(%r9)
10635 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10636 ; AVX2-NEXT: vmovaps %ymm4, (%r9)
10637 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10638 ; AVX2-NEXT: vmovaps %ymm4, 96(%r9)
10639 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10640 ; AVX2-NEXT: vmovaps %ymm4, 32(%r9)
10641 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
10642 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10643 ; AVX2-NEXT: vmovaps %ymm4, 64(%rax)
10644 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10645 ; AVX2-NEXT: vmovaps %ymm4, (%rax)
10646 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10647 ; AVX2-NEXT: vmovaps %ymm4, 96(%rax)
10648 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10649 ; AVX2-NEXT: vmovaps %ymm4, 32(%rax)
10650 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
10651 ; AVX2-NEXT: vmovdqa %ymm15, 64(%rax)
10652 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10653 ; AVX2-NEXT: vmovaps %ymm4, (%rax)
10654 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10655 ; AVX2-NEXT: vmovaps %ymm4, 96(%rax)
10656 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10657 ; AVX2-NEXT: vmovaps %ymm4, 32(%rax)
10658 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
10659 ; AVX2-NEXT: vmovdqa %ymm3, 96(%rax)
10660 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rax)
10661 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rax)
10662 ; AVX2-NEXT: vmovdqa %ymm1, (%rax)
10663 ; AVX2-NEXT: addq $2408, %rsp # imm = 0x968
10664 ; AVX2-NEXT: vzeroupper
10665 ; AVX2-NEXT: retq
10666 ;
10667 ; AVX2-FP-LABEL: load_i16_stride8_vf64:
10668 ; AVX2-FP: # %bb.0:
10669 ; AVX2-FP-NEXT: subq $2408, %rsp # imm = 0x968
10670 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm2
10671 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10672 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm3
10673 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10674 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm0
10675 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10676 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm1
10677 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10678 ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm4
10679 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10680 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm5
10681 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10682 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10683 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10684 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10685 ; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10686 ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm0
10687 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10688 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm1
10689 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10690 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10691 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10692 ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %xmm0
10693 ; AVX2-FP-NEXT: vmovdqa 336(%rdi), %xmm1
10694 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10695 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm4
10696 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10697 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
10698 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10699 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm1
10700 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10701 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
10702 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10703 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
10704 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10705 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
10706 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10707 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10708 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
10709 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10710 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
10711 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10712 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10713 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10714 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm3
10715 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10716 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm2
10717 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10718 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
10719 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10720 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
10721 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10722 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10723 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
10724 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10725 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
10726 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10727 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10728 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10729 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10730 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10731 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10732 ; AVX2-FP-NEXT: vmovdqa 880(%rdi), %xmm0
10733 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10734 ; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm1
10735 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10736 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10737 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10738 ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %xmm0
10739 ; AVX2-FP-NEXT: vmovdqa 848(%rdi), %xmm1
10740 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10741 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm2
10742 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10743 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10744 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10745 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm1
10746 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10747 ; AVX2-FP-NEXT: vmovdqa 784(%rdi), %xmm1
10748 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10749 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %xmm2
10750 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10751 ; AVX2-FP-NEXT: vmovdqa 816(%rdi), %xmm3
10752 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10753 ; AVX2-FP-NEXT: vmovdqa 800(%rdi), %xmm4
10754 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10755 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
10756 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10757 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10758 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10759 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
10760 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10761 ; AVX2-FP-NEXT: vmovdqa 960(%rdi), %ymm2
10762 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10763 ; AVX2-FP-NEXT: vmovdqa 992(%rdi), %ymm1
10764 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10765 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
10766 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10767 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
10768 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10769 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
10770 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10771 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
10772 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10773 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10774 ; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm3
10775 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10776 ; AVX2-FP-NEXT: vmovdqa 928(%rdi), %ymm2
10777 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10778 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
10779 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10780 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
10781 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10782 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
10783 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10784 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7]
10785 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10786 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10787 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10788 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10789 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10790 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm0
10791 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10792 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm1
10793 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10794 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10795 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10796 ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %xmm0
10797 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm1
10798 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10799 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2
10800 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10801 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10802 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10803 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm1
10804 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10805 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
10806 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10807 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2
10808 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10809 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3
10810 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10811 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm4
10812 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10813 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
10814 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10815 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
10816 ; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
10817 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
10818 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
10819 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm1
10820 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10821 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm0
10822 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10823 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
10824 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10825 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
10826 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10827 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
10828 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10829 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
10830 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10831 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7]
10832 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm0
10833 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10834 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1
10835 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10836 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
10837 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10838 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
10839 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10840 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
10841 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10842 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
10843 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10844 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
10845 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
10846 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10847 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10848 ; AVX2-FP-NEXT: vmovdqa 624(%rdi), %xmm4
10849 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10850 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm5
10851 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10852 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10853 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10854 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm4
10855 ; AVX2-FP-NEXT: vmovdqa 592(%rdi), %xmm5
10856 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10857 ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %xmm8
10858 ; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10859 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
10860 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10861 ; AVX2-FP-NEXT: vpbroadcastd %xmm5, %xmm5
10862 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
10863 ; AVX2-FP-NEXT: vmovdqa 528(%rdi), %xmm5
10864 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10865 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm8
10866 ; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10867 ; AVX2-FP-NEXT: vmovdqa 560(%rdi), %xmm9
10868 ; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10869 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm12
10870 ; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10871 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
10872 ; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10873 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
10874 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10875 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
10876 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm5[0,1],xmm4[2,3]
10877 ; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm5
10878 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10879 ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm4
10880 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10881 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,2]
10882 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10883 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
10884 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10885 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10886 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,0,2]
10887 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10888 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
10889 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10890 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10891 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7]
10892 ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm9
10893 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10894 ; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm8
10895 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10896 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,2]
10897 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10898 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
10899 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10900 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10901 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm9[0,1,0,2]
10902 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10903 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
10904 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10905 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10906 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
10907 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
10908 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
10909 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10910 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10911 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,1,1]
10912 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10913 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm8[1],xmm12[2,3]
10914 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10915 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10916 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
10917 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
10918 ; AVX2-FP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
10919 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10920 ; AVX2-FP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10921 ; AVX2-FP-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10922 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
10923 ; AVX2-FP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10924 ; AVX2-FP-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10925 ; AVX2-FP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10926 ; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10927 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
10928 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
10929 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
10930 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10931 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
10932 ; AVX2-FP-NEXT: # xmm12 = mem[1,1,1,1]
10933 ; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10934 ; AVX2-FP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
10935 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10936 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10937 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
10938 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
10939 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10940 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10941 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
10942 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10943 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10944 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
10945 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
10946 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
10947 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10948 ; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload
10949 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1]
10950 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10951 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3]
10952 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10953 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10954 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
10955 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
10956 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10957 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10958 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
10959 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10960 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10961 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
10962 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
10963 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
10964 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10965 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10966 ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1]
10967 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10968 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
10969 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10970 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10971 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
10972 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
10973 ; AVX2-FP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10974 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10975 ; AVX2-FP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10976 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10977 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10978 ; AVX2-FP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10979 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10980 ; AVX2-FP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10981 ; AVX2-FP-NEXT: # ymm3 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10982 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10983 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10984 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10985 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10986 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
10987 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
10988 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
10989 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10990 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10991 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
10992 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10993 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10994 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
10995 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10996 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10997 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10998 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10999 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11000 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
11001 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11002 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11003 ; AVX2-FP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11004 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11005 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11006 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11007 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11008 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11009 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11010 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11011 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
11012 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
11013 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11014 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11015 ; AVX2-FP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
11016 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11017 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11018 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
11019 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11020 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11021 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
11022 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11023 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11024 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11025 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11026 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11027 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
11028 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11029 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11030 ; AVX2-FP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11031 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11032 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11033 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11034 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11035 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11036 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11037 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11038 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
11039 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
11040 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
11041 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11042 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
11043 ; AVX2-FP-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
11044 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
11045 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
11046 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11047 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11048 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11049 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
11050 ; AVX2-FP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
11051 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
11052 ; AVX2-FP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
11053 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11054 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11055 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11056 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11057 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11058 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11059 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
11060 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11061 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
11062 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11063 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
11064 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
11065 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
11066 ; AVX2-FP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
11067 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
11068 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
11069 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11070 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11071 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11072 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11073 ; AVX2-FP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
11074 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11075 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
11076 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11077 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11078 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
11079 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
11080 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
11081 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11082 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11083 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11084 ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
11085 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
11086 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
11087 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
11088 ; AVX2-FP-NEXT: # xmm3 = mem[3,3,3,3]
11089 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
11090 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11091 ; AVX2-FP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11092 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11093 ; AVX2-FP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11094 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11095 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
11096 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11097 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11098 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11099 ; AVX2-FP-NEXT: # ymm15 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11100 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
11101 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
11102 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11103 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11104 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11105 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11106 ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
11107 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
11108 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
11109 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
11110 ; AVX2-FP-NEXT: # xmm3 = mem[3,3,3,3]
11111 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
11112 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11113 ; AVX2-FP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11114 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11115 ; AVX2-FP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11116 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11117 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
11118 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11119 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11120 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
11121 ; AVX2-FP-NEXT: # ymm11 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11122 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7]
11123 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
11124 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11125 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11126 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11127 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11128 ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
11129 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
11130 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
11131 ; AVX2-FP-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload
11132 ; AVX2-FP-NEXT: # xmm3 = mem[3,3,3,3]
11133 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
11134 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11135 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11136 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11137 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
11138 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11139 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11140 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
11141 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
11142 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11143 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11144 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload
11145 ; AVX2-FP-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3]
11146 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
11147 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
11148 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
11149 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
11150 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11151 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11152 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11153 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
11154 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11155 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11156 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
11157 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11158 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11159 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11160 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11161 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
11162 ; AVX2-FP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11163 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11164 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11165 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
11166 ; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11167 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11168 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11169 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
11170 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11171 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11172 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11173 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
11174 ; AVX2-FP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11175 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11176 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
11177 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm1
11178 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11179 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11180 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11181 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11182 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
11183 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11184 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11185 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11186 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11187 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
11188 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11189 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11190 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
11191 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11192 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11193 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11194 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11195 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11196 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11197 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11198 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3]
11199 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11200 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
11201 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11202 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11203 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
11204 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11205 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11206 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11207 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11208 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11209 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11210 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11211 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
11212 ; AVX2-FP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11213 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11214 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11215 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
11216 ; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11217 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11218 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11219 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
11220 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11221 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11222 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11223 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
11224 ; AVX2-FP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11225 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11226 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
11227 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm1
11228 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11229 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11230 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11231 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11232 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
11233 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11234 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11235 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11236 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11237 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
11238 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11239 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
11240 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11241 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11242 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11243 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11244 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11245 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11246 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3]
11247 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11248 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
11249 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11250 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
11251 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11252 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11253 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11254 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11255 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11256 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11257 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
11258 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11259 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11260 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11261 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
11262 ; AVX2-FP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11263 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11264 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
11265 ; AVX2-FP-NEXT: vpbroadcastd %xmm2, %xmm1
11266 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11267 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11268 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
11269 ; AVX2-FP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
11270 ; AVX2-FP-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
11271 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11272 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11273 ; AVX2-FP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
11274 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11275 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
11276 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11277 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11278 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
11279 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11280 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11281 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11282 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11283 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4]
11284 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11285 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4]
11286 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11287 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11288 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11289 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11290 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11291 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11292 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3]
11293 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11294 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7]
11295 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11296 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
11297 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11298 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11299 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11300 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11301 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11302 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11303 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
11304 ; AVX2-FP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11305 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11306 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11307 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
11308 ; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11309 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11310 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11311 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
11312 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11313 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11314 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11315 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
11316 ; AVX2-FP-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11317 ; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11318 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
11319 ; AVX2-FP-NEXT: vpbroadcastd %xmm12, %xmm1
11320 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11321 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11322 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
11323 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
11324 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3]
11325 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11326 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11327 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
11328 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11329 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
11330 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11331 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
11332 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11333 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7]
11334 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
11335 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3]
11336 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11337 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11338 ; AVX2-FP-NEXT: # ymm15 = mem[0,1,1,3]
11339 ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11340 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
11341 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11342 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7]
11343 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11344 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
11345 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
11346 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
11347 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11348 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
11349 ; AVX2-FP-NEXT: # xmm12 = mem[1,1,1,1]
11350 ; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
11351 ; AVX2-FP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
11352 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11353 ; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
11354 ; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
11355 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
11356 ; AVX2-FP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
11357 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11358 ; AVX2-FP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
11359 ; AVX2-FP-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11360 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
11361 ; AVX2-FP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
11362 ; AVX2-FP-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11363 ; AVX2-FP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11364 ; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11365 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
11366 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
11367 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
11368 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11369 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11370 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1]
11371 ; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
11372 ; AVX2-FP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
11373 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11374 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11375 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
11376 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
11377 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11378 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11379 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
11380 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11381 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11382 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
11383 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
11384 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
11385 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11386 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11387 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,1,1]
11388 ; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload
11389 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3]
11390 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11391 ; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm9 # 16-byte Folded Reload
11392 ; AVX2-FP-NEXT: # xmm9 = xmm11[0],mem[0],xmm11[1],mem[1]
11393 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
11394 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11395 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11396 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
11397 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11398 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11399 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
11400 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
11401 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
11402 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11403 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
11404 ; AVX2-FP-NEXT: # xmm4 = mem[1,1,1,1]
11405 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11406 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3]
11407 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11408 ; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
11409 ; AVX2-FP-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
11410 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
11411 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11412 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11413 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11414 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11415 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11416 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
11417 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
11418 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
11419 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11420 ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11421 ; AVX2-FP-NEXT: # xmm0 = mem[2,2,2,2]
11422 ; AVX2-FP-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11423 ; AVX2-FP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
11424 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11425 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11426 ; AVX2-FP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
11427 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11428 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11429 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
11430 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11431 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11432 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
11433 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11434 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11435 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11436 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11437 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11438 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
11439 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11440 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11441 ; AVX2-FP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11442 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11443 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11444 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11445 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11446 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11447 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11448 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11449 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
11450 ; AVX2-FP-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11451 ; AVX2-FP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
11452 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
11453 ; AVX2-FP-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3]
11454 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11455 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11456 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
11457 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11458 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11459 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
11460 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11461 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11462 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11463 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11464 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11465 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
11466 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11467 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11468 ; AVX2-FP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11469 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11470 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11471 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11472 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11473 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11474 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11475 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11476 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11477 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
11478 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
11479 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
11480 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
11481 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
11482 ; AVX2-FP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
11483 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11484 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
11485 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11486 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11487 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm0[7]
11488 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11489 ; AVX2-FP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
11490 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11491 ; AVX2-FP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11492 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11493 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11494 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
11495 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
11496 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7]
11497 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11498 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11499 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,2,2]
11500 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11501 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
11502 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11503 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm10[2],xmm13[2],xmm10[3],xmm13[3]
11504 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm6[2,3]
11505 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
11506 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
11507 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
11508 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
11509 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11510 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11511 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm6[7]
11512 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
11513 ; AVX2-FP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
11514 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
11515 ; AVX2-FP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
11516 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11517 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11518 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
11519 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
11520 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11521 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11522 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
11523 ; AVX2-FP-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload
11524 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
11525 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11526 ; AVX2-FP-NEXT: # xmm2 = mem[3,3,3,3]
11527 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
11528 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11529 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11530 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11531 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11532 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11533 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11534 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11535 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11536 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11537 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11538 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11539 ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
11540 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11541 ; AVX2-FP-NEXT: # xmm2 = mem[2,3,2,3]
11542 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
11543 ; AVX2-FP-NEXT: # xmm3 = mem[3,3,3,3]
11544 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
11545 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
11546 ; AVX2-FP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11547 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11548 ; AVX2-FP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11549 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11550 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
11551 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11552 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11553 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11554 ; AVX2-FP-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11555 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
11556 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
11557 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
11558 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
11559 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
11560 ; AVX2-FP-NEXT: # xmm3 = mem[2,3,2,3]
11561 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,3,3,3]
11562 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3]
11563 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
11564 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11565 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11566 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
11567 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11568 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11569 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
11570 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
11571 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
11572 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11573 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
11574 ; AVX2-FP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
11575 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
11576 ; AVX2-FP-NEXT: # xmm4 = mem[2,3,2,3]
11577 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
11578 ; AVX2-FP-NEXT: # xmm5 = mem[3,3,3,3]
11579 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
11580 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
11581 ; AVX2-FP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11582 ; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11583 ; AVX2-FP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11584 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11585 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
11586 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11587 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11588 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
11589 ; AVX2-FP-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11590 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
11591 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
11592 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
11593 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11594 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi)
11595 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11596 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
11597 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11598 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi)
11599 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11600 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
11601 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11602 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx)
11603 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11604 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx)
11605 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11606 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rdx)
11607 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11608 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx)
11609 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11610 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rcx)
11611 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11612 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx)
11613 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11614 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx)
11615 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11616 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx)
11617 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11618 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r8)
11619 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11620 ; AVX2-FP-NEXT: vmovaps %ymm4, (%r8)
11621 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11622 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r8)
11623 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11624 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8)
11625 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11626 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r9)
11627 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11628 ; AVX2-FP-NEXT: vmovaps %ymm4, (%r9)
11629 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11630 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r9)
11631 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11632 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r9)
11633 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11634 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11635 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rax)
11636 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11637 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rax)
11638 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11639 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rax)
11640 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11641 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rax)
11642 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11643 ; AVX2-FP-NEXT: vmovdqa %ymm15, 64(%rax)
11644 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11645 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rax)
11646 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11647 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rax)
11648 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11649 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rax)
11650 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11651 ; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%rax)
11652 ; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%rax)
11653 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax)
11654 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax)
11655 ; AVX2-FP-NEXT: addq $2408, %rsp # imm = 0x968
11656 ; AVX2-FP-NEXT: vzeroupper
11657 ; AVX2-FP-NEXT: retq
11659 ; AVX2-FCP-LABEL: load_i16_stride8_vf64:
11660 ; AVX2-FCP: # %bb.0:
11661 ; AVX2-FCP-NEXT: subq $2408, %rsp # imm = 0x968
11662 ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
11663 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11664 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm3
11665 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11666 ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm0
11667 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11668 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm1
11669 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11670 ; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm4
11671 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11672 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm5
11673 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11674 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11675 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11676 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11677 ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11678 ; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
11679 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11680 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
11681 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11682 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11683 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11684 ; AVX2-FCP-NEXT: vpbroadcastd %xmm0, %xmm0
11685 ; AVX2-FCP-NEXT: vmovdqa 336(%rdi), %xmm1
11686 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11687 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm4
11688 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11689 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
11690 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11691 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm1
11692 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11693 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
11694 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11695 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
11696 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11697 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
11698 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11699 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11700 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
11701 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11702 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
11703 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11704 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11705 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11706 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
11707 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11708 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
11709 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11710 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
11711 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11712 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
11713 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11714 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11715 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
11716 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11717 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
11718 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11719 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11720 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11721 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11722 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11723 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11724 ; AVX2-FCP-NEXT: vmovdqa 880(%rdi), %xmm0
11725 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11726 ; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm1
11727 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11728 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11729 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11730 ; AVX2-FCP-NEXT: vpbroadcastd %xmm0, %xmm0
11731 ; AVX2-FCP-NEXT: vmovdqa 848(%rdi), %xmm1
11732 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11733 ; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %xmm2
11734 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11735 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
11736 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11737 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm1
11738 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11739 ; AVX2-FCP-NEXT: vmovdqa 784(%rdi), %xmm1
11740 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11741 ; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm2
11742 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11743 ; AVX2-FCP-NEXT: vmovdqa 816(%rdi), %xmm3
11744 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11745 ; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %xmm4
11746 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11747 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
11748 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11749 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
11750 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11751 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
11752 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11753 ; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %ymm2
11754 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11755 ; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %ymm1
11756 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11757 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
11758 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11759 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
11760 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11761 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
11762 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11763 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
11764 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11765 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11766 ; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %ymm3
11767 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11768 ; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %ymm2
11769 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11770 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
11771 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11772 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
11773 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11774 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
11775 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11776 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7]
11777 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11778 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11779 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11780 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11781 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11782 ; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
11783 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11784 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
11785 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11786 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
11787 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11788 ; AVX2-FCP-NEXT: vpbroadcastd %xmm0, %xmm0
11789 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
11790 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11791 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
11792 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11793 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
11794 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11795 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm1
11796 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11797 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
11798 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11799 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
11800 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11801 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
11802 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11803 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
11804 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11805 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
11806 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11807 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
11808 ; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
11809 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
11810 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
11811 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm1
11812 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11813 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
11814 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11815 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
11816 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11817 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
11818 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11819 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
11820 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11821 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
11822 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11823 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7]
11824 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
11825 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11826 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
11827 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11828 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
11829 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11830 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
11831 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11832 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
11833 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11834 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
11835 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11836 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
11837 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
11838 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
11839 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11840 ; AVX2-FCP-NEXT: vmovdqa 624(%rdi), %xmm4
11841 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11842 ; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm5
11843 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11844 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
11845 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11846 ; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm4
11847 ; AVX2-FCP-NEXT: vmovdqa 592(%rdi), %xmm5
11848 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11849 ; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %xmm8
11850 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11851 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
11852 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11853 ; AVX2-FCP-NEXT: vpbroadcastd %xmm5, %xmm5
11854 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
11855 ; AVX2-FCP-NEXT: vmovdqa 528(%rdi), %xmm5
11856 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11857 ; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm8
11858 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11859 ; AVX2-FCP-NEXT: vmovdqa 560(%rdi), %xmm9
11860 ; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11861 ; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm12
11862 ; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11863 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
11864 ; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11865 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
11866 ; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11867 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
11868 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm5[0,1],xmm4[2,3]
11869 ; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm5
11870 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11871 ; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm4
11872 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11873 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,2]
11874 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11875 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
11876 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11877 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11878 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,0,2]
11879 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11880 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
11881 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11882 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11883 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7]
11884 ; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm9
11885 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11886 ; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm8
11887 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11888 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,2]
11889 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11890 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
11891 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11892 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11893 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm9[0,1,0,2]
11894 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11895 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
11896 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11897 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11898 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
11899 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
11900 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
11901 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11902 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11903 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,1,1]
11904 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
11905 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm8[1],xmm12[2,3]
11906 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
11907 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11908 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
11909 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
11910 ; AVX2-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
11911 ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11912 ; AVX2-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
11913 ; AVX2-FCP-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11914 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
11915 ; AVX2-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
11916 ; AVX2-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11917 ; AVX2-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11918 ; AVX2-FCP-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11919 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
11920 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
11921 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
11922 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11923 ; AVX2-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
11924 ; AVX2-FCP-NEXT: # xmm12 = mem[1,1,1,1]
11925 ; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
11926 ; AVX2-FCP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
11927 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11928 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11929 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
11930 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
11931 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11932 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11933 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
11934 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11935 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11936 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
11937 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
11938 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
11939 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11940 ; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload
11941 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1]
11942 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11943 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3]
11944 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11945 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11946 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
11947 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
11948 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11949 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11950 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11951 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11952 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11953 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
11954 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
11955 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
11956 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11957 ; AVX2-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11958 ; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1]
11959 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
11960 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
11961 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11962 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11963 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
11964 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
11965 ; AVX2-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11966 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11967 ; AVX2-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11968 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11969 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11970 ; AVX2-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11971 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11972 ; AVX2-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11973 ; AVX2-FCP-NEXT: # ymm3 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11974 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11975 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11976 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11977 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11978 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
11979 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
11980 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
11981 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11982 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11983 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
11984 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11985 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11986 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
11987 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11988 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11989 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11990 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11991 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11992 ; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
11993 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11994 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11995 ; AVX2-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11996 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11997 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11998 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11999 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12000 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12001 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12002 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12003 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
12004 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
12005 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12006 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12007 ; AVX2-FCP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
12008 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12009 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
12010 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
12011 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12012 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12013 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
12014 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12015 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12016 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12017 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12018 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12019 ; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
12020 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12021 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12022 ; AVX2-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
12023 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12024 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12025 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12026 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12027 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12028 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12029 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12030 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
12031 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
12032 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
12033 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12034 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
12035 ; AVX2-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
12036 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
12037 ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
12038 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12039 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12040 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12041 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
12042 ; AVX2-FCP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
12043 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
12044 ; AVX2-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
12045 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12046 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12047 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12048 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12049 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12050 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12051 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
12052 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12053 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
12054 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
12055 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
12056 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
12057 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
12058 ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
12059 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
12060 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
12061 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12062 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12063 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12064 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12065 ; AVX2-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
12066 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12067 ; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
12068 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12069 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12070 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
12071 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
12072 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
12073 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12074 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12075 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12076 ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
12077 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12078 ; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
12079 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
12080 ; AVX2-FCP-NEXT: # xmm3 = mem[3,3,3,3]
12081 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
12082 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12083 ; AVX2-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
12084 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12085 ; AVX2-FCP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12086 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12087 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
12088 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12089 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12090 ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
12091 ; AVX2-FCP-NEXT: # ymm15 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12092 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
12093 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
12094 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12095 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12096 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12097 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12098 ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
12099 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12100 ; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
12101 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
12102 ; AVX2-FCP-NEXT: # xmm3 = mem[3,3,3,3]
12103 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
12104 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12105 ; AVX2-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
12106 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12107 ; AVX2-FCP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12108 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12109 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
12110 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12111 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12112 ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
12113 ; AVX2-FCP-NEXT: # ymm11 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12114 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7]
12115 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
12116 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12117 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12118 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12119 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12120 ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
12121 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12122 ; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
12123 ; AVX2-FCP-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload
12124 ; AVX2-FCP-NEXT: # xmm3 = mem[3,3,3,3]
12125 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
12126 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12127 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12128 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12129 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
12130 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12131 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12132 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
12133 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
12134 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12135 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12136 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload
12137 ; AVX2-FCP-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3]
12138 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
12139 ; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
12140 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
12141 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
12142 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12143 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12144 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12145 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
12146 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12147 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12148 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
12149 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12150 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12151 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12152 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12153 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
12154 ; AVX2-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12155 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12156 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12157 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
12158 ; AVX2-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12159 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12160 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12161 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12162 ; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12163 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12164 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12165 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
12166 ; AVX2-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12167 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12168 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
12169 ; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm1
12170 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
12171 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
12172 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12173 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
12174 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
12175 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12176 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12177 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
12178 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12179 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
12180 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12181 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12182 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
12183 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12184 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12185 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12186 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12187 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
12188 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12189 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12190 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3]
12191 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12192 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
12193 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12194 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12195 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
12196 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12197 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12198 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12199 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12200 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12201 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12202 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12203 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
12204 ; AVX2-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12205 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12206 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12207 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
12208 ; AVX2-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12209 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12210 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12211 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12212 ; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12213 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12214 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12215 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
12216 ; AVX2-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12217 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12218 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
12219 ; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm1
12220 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
12221 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
12222 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12223 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
12224 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
12225 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12226 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12227 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
12228 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12229 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
12230 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12231 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
12232 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12233 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12234 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12235 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
12236 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12237 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12238 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3]
12239 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12240 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
12241 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12242 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
12243 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12244 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12245 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12246 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12247 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12248 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12249 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12250 ; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12251 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12252 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12253 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
12254 ; AVX2-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12255 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12256 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
12257 ; AVX2-FCP-NEXT: vpbroadcastd %xmm2, %xmm1
12258 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
12259 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12260 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
12261 ; AVX2-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
12262 ; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
12263 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12264 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12265 ; AVX2-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
12266 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12267 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
12268 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12269 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
12270 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
12271 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12272 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12273 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
12274 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12275 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4]
12276 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12277 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4]
12278 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12279 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12280 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12281 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
12282 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12283 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12284 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3]
12285 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12286 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7]
12287 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12288 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
12289 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12290 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12291 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12292 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12293 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12294 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12295 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
12296 ; AVX2-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12297 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12298 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12299 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
12300 ; AVX2-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12301 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12302 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12303 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
12304 ; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12305 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12306 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12307 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
12308 ; AVX2-FCP-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
12309 ; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12310 ; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
12311 ; AVX2-FCP-NEXT: vpbroadcastd %xmm12, %xmm1
12312 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
12313 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
12314 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
12315 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
12316 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3]
12317 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12318 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
12319 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
12320 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12321 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
12322 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12323 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
12324 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12325 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12326 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
12327 ; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3]
12328 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12329 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
12330 ; AVX2-FCP-NEXT: # ymm15 = mem[0,1,1,3]
12331 ; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12332 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
12333 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12334 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7]
12335 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12336 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
12337 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
12338 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
12339 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12340 ; AVX2-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
12341 ; AVX2-FCP-NEXT: # xmm12 = mem[1,1,1,1]
12342 ; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
12343 ; AVX2-FCP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
12344 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
12345 ; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
12346 ; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
12347 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
12348 ; AVX2-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
12349 ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12350 ; AVX2-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
12351 ; AVX2-FCP-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12352 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
12353 ; AVX2-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
12354 ; AVX2-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12355 ; AVX2-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
12356 ; AVX2-FCP-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12357 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
12358 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
12359 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
12360 ; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12361 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12362 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1]
12363 ; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
12364 ; AVX2-FCP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
12365 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
12366 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
12367 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
12368 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
12369 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12370 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12371 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
12372 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12373 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12374 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
12375 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
12376 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
12377 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12378 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
12379 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,1,1]
12380 ; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload
12381 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3]
12382 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
12383 ; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm9 # 16-byte Folded Reload
12384 ; AVX2-FCP-NEXT: # xmm9 = xmm11[0],mem[0],xmm11[1],mem[1]
12385 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
12386 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12387 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12388 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
12389 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12390 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12391 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
12392 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
12393 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
12394 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12395 ; AVX2-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
12396 ; AVX2-FCP-NEXT: # xmm4 = mem[1,1,1,1]
12397 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
12398 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3]
12399 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
12400 ; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
12401 ; AVX2-FCP-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
12402 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
12403 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12404 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12405 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
12406 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12407 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12408 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
12409 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
12410 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
12411 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12412 ; AVX2-FCP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
12413 ; AVX2-FCP-NEXT: # xmm0 = mem[2,2,2,2]
12414 ; AVX2-FCP-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12415 ; AVX2-FCP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
12416 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
12417 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
12418 ; AVX2-FCP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
12419 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12420 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
12421 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
12422 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12423 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12424 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
12425 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12426 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12427 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12428 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12429 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12430 ; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
12431 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12432 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12433 ; AVX2-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
12434 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12435 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12436 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12437 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12438 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12439 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12440 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12441 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
12442 ; AVX2-FCP-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12443 ; AVX2-FCP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
12444 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
12445 ; AVX2-FCP-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3]
12446 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12447 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
12448 ; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
12449 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12450 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12451 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
12452 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12453 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12454 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12455 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12456 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12457 ; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
12458 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12459 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12460 ; AVX2-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
12461 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12462 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12463 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12464 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12465 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12466 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12467 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12468 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12469 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
12470 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
12471 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
12472 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
12473 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
12474 ; AVX2-FCP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
12475 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12476 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
12477 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12478 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12479 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12480 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12481 ; AVX2-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
12482 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12483 ; AVX2-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
12484 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12485 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12486 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
12487 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
12488 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7]
12489 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12490 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
12491 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,2,2]
12492 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
12493 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
12494 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12495 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm10[2],xmm13[2],xmm10[3],xmm13[3]
12496 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm6[2,3]
12497 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
12498 ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
12499 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
12500 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
12501 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12502 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12503 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm6[7]
12504 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
12505 ; AVX2-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
12506 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
12507 ; AVX2-FCP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
12508 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12509 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12510 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
12511 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
12512 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12513 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12514 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
12515 ; AVX2-FCP-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload
12516 ; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
12517 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12518 ; AVX2-FCP-NEXT: # xmm2 = mem[3,3,3,3]
12519 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
12520 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12521 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12522 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12523 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12524 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12525 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12526 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12527 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12528 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12529 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12530 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12531 ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
12532 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12533 ; AVX2-FCP-NEXT: # xmm2 = mem[2,3,2,3]
12534 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
12535 ; AVX2-FCP-NEXT: # xmm3 = mem[3,3,3,3]
12536 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
12537 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
12538 ; AVX2-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12539 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12540 ; AVX2-FCP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12541 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12542 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
12543 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12544 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12545 ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12546 ; AVX2-FCP-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12547 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
12548 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
12549 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
12550 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
12551 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
12552 ; AVX2-FCP-NEXT: # xmm3 = mem[2,3,2,3]
12553 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,3,3,3]
12554 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3]
12555 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
12556 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12557 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12558 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
12559 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12560 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12561 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
12562 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
12563 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12564 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12565 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
12566 ; AVX2-FCP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
12567 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
12568 ; AVX2-FCP-NEXT: # xmm4 = mem[2,3,2,3]
12569 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
12570 ; AVX2-FCP-NEXT: # xmm5 = mem[3,3,3,3]
12571 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
12572 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
12573 ; AVX2-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12574 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12575 ; AVX2-FCP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12576 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12577 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
12578 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12579 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12580 ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
12581 ; AVX2-FCP-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12582 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
12583 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
12584 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
12585 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12586 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi)
12587 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12588 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
12589 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12590 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi)
12591 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12592 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi)
12593 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12594 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx)
12595 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12596 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx)
12597 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12598 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rdx)
12599 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12600 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx)
12601 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12602 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rcx)
12603 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12604 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx)
12605 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12606 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx)
12607 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12608 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx)
12609 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12610 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%r8)
12611 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12612 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8)
12613 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12614 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%r8)
12615 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12616 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r8)
12617 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12618 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%r9)
12619 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12620 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r9)
12621 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12622 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%r9)
12623 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12624 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r9)
12625 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12626 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12627 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rax)
12628 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12629 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax)
12630 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12631 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rax)
12632 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12633 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rax)
12634 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12635 ; AVX2-FCP-NEXT: vmovdqa %ymm15, 64(%rax)
12636 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12637 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax)
12638 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12639 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rax)
12640 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12641 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rax)
12642 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12643 ; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%rax)
12644 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
12645 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax)
12646 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax)
12647 ; AVX2-FCP-NEXT: addq $2408, %rsp # imm = 0x968
12648 ; AVX2-FCP-NEXT: vzeroupper
12649 ; AVX2-FCP-NEXT: retq
12650 ;
12651 ; AVX512-LABEL: load_i16_stride8_vf64:
12652 ; AVX512:       # %bb.0:
12653 ; AVX512-NEXT: subq $2408, %rsp # imm = 0x968
12654 ; AVX512-NEXT: vmovdqa 368(%rdi), %xmm1
12655 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12656 ; AVX512-NEXT: vmovdqa 352(%rdi), %xmm0
12657 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12658 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
12659 ; AVX512-NEXT: vmovdqa 336(%rdi), %xmm2
12660 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12661 ; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1
12662 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12663 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
12664 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4]
12665 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
12666 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12667 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
12668 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
12669 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12670 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12671 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12672 ; AVX512-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
12673 ; AVX512-NEXT: vmovdqa 304(%rdi), %xmm0
12674 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12675 ; AVX512-NEXT: vmovdqa 288(%rdi), %xmm2
12676 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12677 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
12678 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12679 ; AVX512-NEXT: vmovdqa 272(%rdi), %xmm0
12680 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12681 ; AVX512-NEXT: vmovdqa 256(%rdi), %xmm2
12682 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12683 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
12684 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
12685 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm31
12686 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12687 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
12688 ; AVX512-NEXT: vmovdqa 480(%rdi), %ymm1
12689 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12690 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
12691 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12692 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
12693 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12694 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12695 ; AVX512-NEXT: vmovdqa 448(%rdi), %ymm2
12696 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12697 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
12698 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12699 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
12700 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12701 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12702 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12703 ; AVX512-NEXT: vmovdqa 416(%rdi), %ymm2
12704 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12705 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
12706 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12707 ; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2
12708 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12709 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
12710 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12711 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
12712 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12713 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12714 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
12715 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12716 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12717 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12718 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12719 ; AVX512-NEXT: movb $-64, %al
12720 ; AVX512-NEXT: kmovw %eax, %k1
12721 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
12722 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm2
12723 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12724 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1
12725 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12726 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
12727 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3
12728 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12729 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2
12730 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12731 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
12732 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2]
12733 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
12734 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12735 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
12736 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12737 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
12738 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12739 ; AVX512-NEXT: vpermt2d %xmm1, %xmm6, %xmm2
12740 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1
12741 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12742 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
12743 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12744 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4
12745 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12746 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm5
12747 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12748 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
12749 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12750 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
12751 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
12752 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm17
12753 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12754 ; AVX512-NEXT: vmovdqa 224(%rdi), %ymm2
12755 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12756 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
12757 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12758 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2
12759 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12760 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
12761 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12762 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,4]
12763 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12764 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12765 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,4]
12766 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12767 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12768 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
12769 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm3
12770 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12771 ; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm3[0,1,0,2]
12772 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm3
12773 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12774 ; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm3[0,1,0,2]
12775 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,2,2,3,4,6,6,7]
12776 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12777 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12778 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[0,2,2,3,4,6,6,7]
12779 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12780 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
12781 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
12782 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
12783 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
12784 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12785 ; AVX512-NEXT: vmovdqa 880(%rdi), %xmm1
12786 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12787 ; AVX512-NEXT: vmovdqa 864(%rdi), %xmm0
12788 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12789 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
12790 ; AVX512-NEXT: vmovdqa 848(%rdi), %xmm2
12791 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12792 ; AVX512-NEXT: vmovdqa 832(%rdi), %xmm1
12793 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12794 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
12795 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
12796 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
12797 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12798 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
12799 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12800 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12801 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12802 ; AVX512-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
12803 ; AVX512-NEXT: vmovdqa %xmm6, %xmm10
12804 ; AVX512-NEXT: vmovdqa 816(%rdi), %xmm0
12805 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12806 ; AVX512-NEXT: vmovdqa 800(%rdi), %xmm2
12807 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12808 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
12809 ; AVX512-NEXT: vmovdqa 784(%rdi), %xmm0
12810 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12811 ; AVX512-NEXT: vmovdqa 768(%rdi), %xmm2
12812 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12813 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
12814 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
12815 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm19
12816 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm20
12817 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12818 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
12819 ; AVX512-NEXT: vmovdqa 992(%rdi), %ymm1
12820 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12821 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
12822 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12823 ; AVX512-NEXT: vmovdqa 960(%rdi), %ymm1
12824 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12825 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2]
12826 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12827 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,0,4,5,6,4]
12828 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12829 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4]
12830 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12831 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12832 ; AVX512-NEXT: vmovdqa 928(%rdi), %ymm2
12833 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12834 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
12835 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12836 ; AVX512-NEXT: vmovdqa 896(%rdi), %ymm2
12837 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12838 ; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm2[0,1,0,2]
12839 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
12840 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12841 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
12842 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12843 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5],ymm9[6,7]
12844 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12845 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
12846 ; AVX512-NEXT: vmovdqa 624(%rdi), %xmm2
12847 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12848 ; AVX512-NEXT: vmovdqa 608(%rdi), %xmm1
12849 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12850 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
12851 ; AVX512-NEXT: vmovdqa 592(%rdi), %xmm5
12852 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12853 ; AVX512-NEXT: vmovdqa 576(%rdi), %xmm2
12854 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12855 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
12856 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
12857 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
12858 ; AVX512-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
12859 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
12860 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
12861 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12862 ; AVX512-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
12863 ; AVX512-NEXT: vmovdqa 560(%rdi), %xmm1
12864 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12865 ; AVX512-NEXT: vmovdqa 544(%rdi), %xmm5
12866 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12867 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
12868 ; AVX512-NEXT: vmovdqa 528(%rdi), %xmm1
12869 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12870 ; AVX512-NEXT: vmovdqa 512(%rdi), %xmm5
12871 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12872 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
12873 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
12874 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3]
12875 ; AVX512-NEXT: vmovdqa 736(%rdi), %ymm1
12876 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12877 ; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
12878 ; AVX512-NEXT: vmovdqa 704(%rdi), %ymm1
12879 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12880 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
12881 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12882 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,0,4,5,6,4]
12883 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12884 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
12885 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12886 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm10[7]
12887 ; AVX512-NEXT: vmovdqa 672(%rdi), %ymm5
12888 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12889 ; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2]
12890 ; AVX512-NEXT: vmovdqa 640(%rdi), %ymm5
12891 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12892 ; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2]
12893 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7]
12894 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12895 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7]
12896 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12897 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7]
12898 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7]
12899 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
12900 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
12901 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12902 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1]
12903 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12904 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
12905 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12906 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
12907 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
12908 ; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12909 ; AVX512-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12910 ; AVX512-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
12911 ; AVX512-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12912 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
12913 ; AVX512-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
12914 ; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12915 ; AVX512-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
12916 ; AVX512-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12917 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
12918 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
12919 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 {%k1}
12920 ; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12921 ; AVX512-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12922 ; AVX512-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
12923 ; AVX512-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12924 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
12925 ; AVX512-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
12926 ; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12927 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12928 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
12929 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
12930 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
12931 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
12932 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3]
12933 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
12934 ; AVX512-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
12935 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
12936 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
12937 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12938 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1]
12939 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm9
12940 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
12941 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12942 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
12943 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12944 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12945 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
12946 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12947 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12948 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
12949 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
12950 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
12951 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
12952 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
12953 ; AVX512-NEXT: vmovdqa %xmm8, %xmm5
12954 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
12955 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm4
12956 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
12957 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12958 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12959 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12960 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12961 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12962 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
12963 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12964 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
12965 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
12966 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12967 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3]
12968 ; AVX512-NEXT: vmovdqa64 %xmm14, %xmm16
12969 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12970 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
12971 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
12972 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12973 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
12974 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12975 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm27
12976 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12977 ; AVX512-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
12978 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12979 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26
12980 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12981 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12982 ; AVX512-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
12983 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12984 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm23
12985 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12986 ; AVX512-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
12987 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12988 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm21
12989 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12990 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12991 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
12992 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12993 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
12994 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12995 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm25
12996 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
12997 ; AVX512-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
12998 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12999 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13000 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[3,1,2,3,7,5,6,7]
13001 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13002 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[3,1,2,3,7,5,6,7]
13003 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13004 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13005 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13006 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3]
13007 ; AVX512-NEXT: vmovdqa64 %xmm13, %xmm22
13008 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13009 ; AVX512-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
13010 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13011 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13012 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13013 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3]
13014 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm24
13015 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13016 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
13017 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
13018 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
13019 ; AVX512-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
13020 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13021 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
13022 ; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
13023 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13024 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13025 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
13026 ; AVX512-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
13027 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13028 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
13029 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13030 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
13031 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13032 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
13033 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3]
13034 ; AVX512-NEXT: vmovdqa64 %xmm5, %xmm18
13035 ; AVX512-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload
13036 ; AVX512-NEXT: # xmm3 = xmm0[0,1],mem[2,3]
13037 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7]
13038 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13039 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
13040 ; AVX512-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
13041 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13042 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13043 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7]
13044 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13045 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7]
13046 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13047 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
13048 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13049 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
13050 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
13051 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13052 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0]
13053 ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm1
13054 ; AVX512-NEXT: vpermt2d %xmm16, %xmm0, %xmm1
13055 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
13056 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
13057 ; AVX512-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
13058 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13059 ; AVX512-NEXT: vmovdqa64 %ymm27, %ymm1
13060 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13061 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm3
13062 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13063 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
13064 ; AVX512-NEXT: vmovdqa64 %ymm23, %ymm3
13065 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13066 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm13
13067 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13068 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
13069 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
13070 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13071 ; AVX512-NEXT: vmovdqa64 %xmm17, %xmm1
13072 ; AVX512-NEXT: vpermt2d %xmm22, %xmm16, %xmm1
13073 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13074 ; AVX512-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
13075 ; AVX512-NEXT: vmovdqa64 %ymm25, %ymm3
13076 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13077 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13078 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
13079 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13080 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13081 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
13082 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7]
13083 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
13084 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13085 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13086 ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm0
13087 ; AVX512-NEXT: vpermt2d %xmm24, %xmm16, %xmm0
13088 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13089 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
13090 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13091 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13092 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13093 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
13094 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13095 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13096 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
13097 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
13098 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13099 ; AVX512-NEXT: vpermt2d %xmm18, %xmm16, %xmm15
13100 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
13101 ; AVX512-NEXT: # xmm1 = xmm15[0,1],mem[2,3]
13102 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13103 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13104 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
13105 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13106 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13107 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
13108 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
13109 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13110 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13111 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13112 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13113 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
13114 ; AVX512-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13115 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13116 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
13117 ; AVX512-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13118 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
13119 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
13120 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13121 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
13122 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13123 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
13124 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13125 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4]
13126 ; AVX512-NEXT: vpermt2d %xmm5, %xmm10, %xmm6
13127 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13128 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13129 ; AVX512-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13130 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13131 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13132 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13133 ; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13134 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13135 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28
13136 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
13137 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13138 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13139 ; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
13140 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13141 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13142 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3]
13143 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13144 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
13145 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13146 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13147 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
13148 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13149 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13150 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13151 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13152 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3]
13153 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13154 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13155 ; AVX512-NEXT: # ymm3 = mem[0,1,1,3]
13156 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13157 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
13158 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13159 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13160 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
13161 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13162 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13163 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13164 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13165 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13166 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13167 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13168 ; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
13169 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13170 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13171 ; AVX512-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
13172 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2]
13173 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
13174 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13175 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13176 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13177 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13178 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13179 ; AVX512-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
13180 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13181 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
13182 ; AVX512-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
13183 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13184 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13185 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13186 ; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
13187 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13188 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
13189 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
13190 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13191 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3]
13192 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13193 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13194 ; AVX512-NEXT: # ymm3 = mem[0,1,1,3]
13195 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13196 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
13197 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13198 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13199 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
13200 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13201 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17
13202 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13203 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
13204 ; AVX512-NEXT: # ymm30 = mem[0,1,1,3]
13205 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
13206 ; AVX512-NEXT: # ymm4 = mem[0,1,1,3]
13207 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13208 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7]
13209 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13210 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
13211 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13212 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
13213 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
13214 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13215 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13216 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13217 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13218 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13219 ; AVX512-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13220 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13221 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13222 ; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
13223 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
13224 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
13225 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13226 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13227 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13228 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13229 ; AVX512-NEXT: vpermt2d %xmm0, %xmm10, %xmm1
13230 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13231 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13232 ; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13233 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13234 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
13235 ; AVX512-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13236 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
13237 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm16
13238 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20
13239 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13240 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
13241 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
13242 ; AVX512-NEXT: # ymm0 = mem[0,1,1,3]
13243 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13244 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13245 ; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
13246 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13247 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,4]
13248 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13249 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
13250 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13251 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13252 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
13253 ; AVX512-NEXT: # ymm22 = mem[0,1,1,3]
13254 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
13255 ; AVX512-NEXT: # ymm29 = mem[0,1,1,3]
13256 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7]
13257 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13258 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,2,2,3,4,6,6,7]
13259 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13260 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7]
13261 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13262 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
13263 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13264 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13265 ; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13266 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13267 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
13268 ; AVX512-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13269 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13270 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
13271 ; AVX512-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13272 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13273 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
13274 ; AVX512-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13275 ; AVX512-NEXT: vmovdqa %xmm10, %xmm1
13276 ; AVX512-NEXT: vpermi2d %xmm2, %xmm8, %xmm1
13277 ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm19
13278 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm24
13279 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
13280 ; AVX512-NEXT: vmovdqa64 %xmm13, %xmm31
13281 ; AVX512-NEXT: vmovdqa64 %xmm11, %xmm21
13282 ; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3]
13283 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13284 ; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
13285 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13286 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
13287 ; AVX512-NEXT: # ymm0 = mem[0,1,1,3]
13288 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13289 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
13290 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13291 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4]
13292 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13293 ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7]
13294 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
13295 ; AVX512-NEXT: # ymm0 = mem[0,1,1,3]
13296 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13297 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
13298 ; AVX512-NEXT: # ymm10 = mem[0,1,1,3]
13299 ; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13300 ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7]
13301 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13302 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7]
13303 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13304 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
13305 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
13306 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
13307 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
13308 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13309 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1]
13310 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
13311 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
13312 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13313 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
13314 ; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
13315 ; AVX512-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13316 ; AVX512-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
13317 ; AVX512-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13318 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
13319 ; AVX512-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
13320 ; AVX512-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13321 ; AVX512-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
13322 ; AVX512-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13323 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
13324 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
13325 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13326 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1}
13327 ; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
13328 ; AVX512-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13329 ; AVX512-NEXT: vmovdqa64 %ymm17, %ymm13
13330 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13331 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
13332 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13333 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13334 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
13335 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
13336 ; AVX512-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
13337 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
13338 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
13339 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3]
13340 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
13341 ; AVX512-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
13342 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
13343 ; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
13344 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13345 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1]
13346 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm7
13347 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
13348 ; AVX512-NEXT: vmovdqa64 %xmm23, %xmm9
13349 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
13350 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13351 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13352 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
13353 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13354 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13355 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
13356 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13357 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
13358 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
13359 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
13360 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm6
13361 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
13362 ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm5
13363 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
13364 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
13365 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13366 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13367 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13368 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13369 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13370 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
13371 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13372 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
13373 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13374 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13375 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3]
13376 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13377 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
13378 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13379 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13380 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
13381 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13382 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm27
13383 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13384 ; AVX512-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
13385 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13386 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26
13387 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13388 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13389 ; AVX512-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
13390 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13391 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm25
13392 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
13393 ; AVX512-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7]
13394 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13395 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13396 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13397 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13398 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13399 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
13400 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13401 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21
13402 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
13403 ; AVX512-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
13404 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13405 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13406 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7]
13407 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13408 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13409 ; AVX512-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
13410 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13411 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13412 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13413 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3]
13414 ; AVX512-NEXT: vmovdqa64 %xmm15, %xmm23
13415 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13416 ; AVX512-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
13417 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13418 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13419 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13420 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3]
13421 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm19
13422 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm30
13423 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13424 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
13425 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
13426 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
13427 ; AVX512-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
13428 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13429 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
13430 ; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
13431 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13432 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13433 ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
13434 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13435 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7]
13436 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13437 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
13438 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13439 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
13440 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2]
13441 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
13442 ; AVX512-NEXT: vmovdqa64 %xmm5, %xmm22
13443 ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm4
13444 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3]
13445 ; AVX512-NEXT: vmovdqa64 %xmm6, %xmm29
13446 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
13447 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
13448 ; AVX512-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
13449 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13450 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18
13451 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
13452 ; AVX512-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
13453 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13454 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13455 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13456 ; AVX512-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
13457 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13458 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm31
13459 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13460 ; AVX512-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
13461 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13462 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
13463 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13464 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
13465 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
13466 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
13467 ; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1
13468 ; AVX512-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
13469 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13470 ; AVX512-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
13471 ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
13472 ; AVX512-NEXT: vmovdqa64 %ymm27, %ymm3
13473 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13474 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm6
13475 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13476 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7]
13477 ; AVX512-NEXT: vmovdqa64 %ymm25, %ymm6
13478 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13479 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13480 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
13481 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7]
13482 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
13483 ; AVX512-NEXT: vmovdqa64 %xmm17, %xmm3
13484 ; AVX512-NEXT: vpermt2d %xmm23, %xmm16, %xmm3
13485 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
13486 ; AVX512-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
13487 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm6
13488 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13489 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13490 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
13491 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13492 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13493 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
13494 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
13495 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
13496 ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
13497 ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm3
13498 ; AVX512-NEXT: vpermt2d %xmm30, %xmm16, %xmm3
13499 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
13500 ; AVX512-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
13501 ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
13502 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13503 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13504 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
13505 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13506 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13507 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
13508 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
13509 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1}
13510 ; AVX512-NEXT: vpermt2d %xmm29, %xmm16, %xmm4
13511 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
13512 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
13513 ; AVX512-NEXT: vmovdqa64 %ymm18, %ymm7
13514 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13515 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13516 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
13517 ; AVX512-NEXT: vmovdqa64 %ymm31, %ymm4
13518 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13519 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13520 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
13521 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
13522 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
13523 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
13524 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13525 ; AVX512-NEXT: vmovaps %zmm3, 64(%rsi)
13526 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13527 ; AVX512-NEXT: vmovaps %zmm3, (%rsi)
13528 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13529 ; AVX512-NEXT: vmovaps %zmm3, 64(%rdx)
13530 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13531 ; AVX512-NEXT: vmovaps %zmm3, (%rdx)
13532 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13533 ; AVX512-NEXT: vmovaps %zmm3, 64(%rcx)
13534 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13535 ; AVX512-NEXT: vmovaps %zmm3, (%rcx)
13536 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13537 ; AVX512-NEXT: vmovaps %zmm3, 64(%r8)
13538 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13539 ; AVX512-NEXT: vmovaps %zmm3, (%r8)
13540 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13541 ; AVX512-NEXT: vmovaps %zmm3, 64(%r9)
13542 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13543 ; AVX512-NEXT: vmovaps %zmm3, (%r9)
13544 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
13545 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13546 ; AVX512-NEXT: vmovaps %zmm3, 64(%rax)
13547 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13548 ; AVX512-NEXT: vmovaps %zmm3, (%rax)
13549 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
13550 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax)
13551 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13552 ; AVX512-NEXT: vmovaps %zmm0, (%rax)
13553 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
13554 ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax)
13555 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
13556 ; AVX512-NEXT: addq $2408, %rsp # imm = 0x968
13557 ; AVX512-NEXT: vzeroupper
13558 ; AVX512-NEXT: retq
13560 ; AVX512-FCP-LABEL: load_i16_stride8_vf64:
13561 ; AVX512-FCP: # %bb.0:
13562 ; AVX512-FCP-NEXT: subq $2312, %rsp # imm = 0x908
13563 ; AVX512-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
13564 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13565 ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
13566 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13567 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13568 ; AVX512-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
13569 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13570 ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
13571 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13572 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13573 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
13574 ; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0
13575 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm0
13576 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
13577 ; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
13578 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13579 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
13580 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13581 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13582 ; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
13583 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13584 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
13585 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13586 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13587 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
13588 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm30
13589 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13590 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
13591 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13592 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13593 ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
13594 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13595 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
13596 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13597 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
13598 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13599 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13600 ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
13601 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13602 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
13603 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13604 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
13605 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13606 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13607 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13608 ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
13609 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13610 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
13611 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13612 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
13613 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13614 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
13615 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13616 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
13617 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13618 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13619 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
13620 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13621 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13622 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13623 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13624 ; AVX512-FCP-NEXT: movb $-64, %al
13625 ; AVX512-FCP-NEXT: kmovw %eax, %k1
13626 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13627 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
13628 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13629 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
13630 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13631 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13632 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
13633 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13634 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
13635 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13636 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13637 ; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm1
13638 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
13639 ; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm1
13640 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm19
13641 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13642 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
13643 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13644 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
13645 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13646 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
13647 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13648 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
13649 ; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13650 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
13651 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13652 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
13653 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
13654 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm31
13655 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13656 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm2
13657 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13658 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
13659 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13660 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
13661 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13662 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
13663 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13664 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
13665 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13666 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
13667 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4]
13668 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13669 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13670 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
13671 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13672 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,0,2]
13673 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13674 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
13675 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13676 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2]
13677 ; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13678 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7]
13679 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13680 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7]
13681 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13682 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
13683 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
13684 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13685 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13686 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13687 ; AVX512-FCP-NEXT: vmovdqa 880(%rdi), %xmm0
13688 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13689 ; AVX512-FCP-NEXT: vmovdqa 864(%rdi), %xmm1
13690 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13691 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13692 ; AVX512-FCP-NEXT: vmovdqa 848(%rdi), %xmm0
13693 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13694 ; AVX512-FCP-NEXT: vmovdqa 832(%rdi), %xmm1
13695 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13696 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13697 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm0
13698 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
13699 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm2
13700 ; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm0
13701 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
13702 ; AVX512-FCP-NEXT: vmovdqa 816(%rdi), %xmm1
13703 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13704 ; AVX512-FCP-NEXT: vmovdqa 800(%rdi), %xmm3
13705 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13706 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
13707 ; AVX512-FCP-NEXT: vmovdqa 784(%rdi), %xmm1
13708 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13709 ; AVX512-FCP-NEXT: vmovdqa 768(%rdi), %xmm3
13710 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13711 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
13712 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
13713 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
13714 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm25
13715 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13716 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm9
13717 ; AVX512-FCP-NEXT: vmovdqa 992(%rdi), %ymm0
13718 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13719 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
13720 ; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
13721 ; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm0
13722 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13723 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
13724 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13725 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4]
13726 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13727 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4]
13728 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13729 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13730 ; AVX512-FCP-NEXT: vmovdqa 928(%rdi), %ymm1
13731 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13732 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
13733 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13734 ; AVX512-FCP-NEXT: vmovdqa 896(%rdi), %ymm1
13735 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13736 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2]
13737 ; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13738 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
13739 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13740 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7]
13741 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13742 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7]
13743 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13744 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1}
13745 ; AVX512-FCP-NEXT: vmovdqa 624(%rdi), %xmm0
13746 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13747 ; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %xmm1
13748 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13749 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13750 ; AVX512-FCP-NEXT: vmovdqa 592(%rdi), %xmm0
13751 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13752 ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm1
13753 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13754 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13755 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm0
13756 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm29
13757 ; AVX512-FCP-NEXT: vpermt2d %xmm11, %xmm2, %xmm0
13758 ; AVX512-FCP-NEXT: vmovdqa64 %xmm11, %xmm16
13759 ; AVX512-FCP-NEXT: vmovdqa 560(%rdi), %xmm1
13760 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13761 ; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %xmm2
13762 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13763 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13764 ; AVX512-FCP-NEXT: vmovdqa 528(%rdi), %xmm1
13765 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13766 ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm2
13767 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13768 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13769 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
13770 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
13771 ; AVX512-FCP-NEXT: vmovdqa64 %xmm11, %xmm21
13772 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
13773 ; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm0
13774 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13775 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
13776 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13777 ; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm0
13778 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13779 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
13780 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13781 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
13782 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13783 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
13784 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
13785 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13786 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7]
13787 ; AVX512-FCP-NEXT: vmovdqa 672(%rdi), %ymm0
13788 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13789 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
13790 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13791 ; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm0
13792 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13793 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2]
13794 ; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13795 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7]
13796 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13797 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7]
13798 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13799 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
13800 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
13801 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
13802 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
13803 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13804 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0]
13805 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
13806 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm0
13807 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm30
13808 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1]
13809 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
13810 ; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
13811 ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13812 ; AVX512-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13813 ; AVX512-FCP-NEXT: # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13814 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
13815 ; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13816 ; AVX512-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13817 ; AVX512-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
13818 ; AVX512-FCP-NEXT: # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13819 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
13820 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13821 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
13822 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
13823 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
13824 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
13825 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
13826 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm9
13827 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1]
13828 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
13829 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm11
13830 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13831 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13832 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
13833 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13834 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13835 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
13836 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
13837 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
13838 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
13839 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13840 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
13841 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
13842 ; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm11
13843 ; AVX512-FCP-NEXT: vpermt2d %xmm25, %xmm13, %xmm0
13844 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm8
13845 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1]
13846 ; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm19
13847 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
13848 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13849 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13850 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
13851 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13852 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13853 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
13854 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13855 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
13856 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
13857 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
13858 ; AVX512-FCP-NEXT: vpermt2d %xmm21, %xmm13, %xmm3
13859 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1]
13860 ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm6
13861 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
13862 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm4
13863 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13864 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13865 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
13866 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13867 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13868 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
13869 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
13870 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
13871 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13872 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13873 ; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0
13874 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3]
13875 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13876 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
13877 ; AVX512-FCP-NEXT: vpermt2d %xmm24, %xmm5, %xmm0
13878 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
13879 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm28
13880 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3]
13881 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13882 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13883 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13884 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
13885 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13886 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13887 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13888 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
13889 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13890 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
13891 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13892 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13893 ; AVX512-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
13894 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13895 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
13896 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
13897 ; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
13898 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13899 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
13900 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13901 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13902 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13903 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1
13904 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13905 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3]
13906 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm1
13907 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3]
13908 ; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm25
13909 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm23
13910 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13911 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13912 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
13913 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13914 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
13915 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
13916 ; AVX512-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
13917 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13918 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13919 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
13920 ; AVX512-FCP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
13921 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13922 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
13923 ; AVX512-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
13924 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13925 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
13926 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
13927 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13928 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13929 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13930 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
13931 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3]
13932 ; AVX512-FCP-NEXT: vpermt2d %xmm8, %xmm5, %xmm0
13933 ; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm19
13934 ; AVX512-FCP-NEXT: vmovdqa64 %xmm11, %xmm20
13935 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
13936 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13937 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24
13938 ; AVX512-FCP-NEXT: vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload
13939 ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
13940 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13941 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13942 ; AVX512-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
13943 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13944 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13945 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
13946 ; AVX512-FCP-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
13947 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13948 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
13949 ; AVX512-FCP-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7]
13950 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13951 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
13952 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13953 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1}
13954 ; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm0
13955 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3]
13956 ; AVX512-FCP-NEXT: vpermt2d %xmm16, %xmm5, %xmm0
13957 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3]
13958 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
13959 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
13960 ; AVX512-FCP-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7]
13961 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13962 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
13963 ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
13964 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13965 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13966 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
13967 ; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
13968 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13969 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13970 ; AVX512-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
13971 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13972 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
13973 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13974 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
13975 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
13976 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13977 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
13978 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
13979 ; AVX512-FCP-NEXT: vpermt2d %xmm28, %xmm16, %xmm1
13980 ; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
13981 ; AVX512-FCP-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
13982 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13983 ; AVX512-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13984 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13985 ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm3
13986 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13987 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
13988 ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
13989 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13990 ; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm15
13991 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13992 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
13993 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
13994 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13995 ; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm1
13996 ; AVX512-FCP-NEXT: vpermt2d %xmm23, %xmm16, %xmm1
13997 ; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm3
13998 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
13999 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
14000 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14001 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14002 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
14003 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14004 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14005 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
14006 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
14007 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
14008 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14009 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14010 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
14011 ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm16, %xmm0
14012 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
14013 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14014 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14015 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14016 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14017 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
14018 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14019 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14020 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
14021 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
14022 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14023 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
14024 ; AVX512-FCP-NEXT: vpermt2d %xmm21, %xmm16, %xmm1
14025 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
14026 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14027 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14028 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
14029 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14030 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14031 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
14032 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
14033 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14034 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14035 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14036 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14037 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
14038 ; AVX512-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14039 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14040 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
14041 ; AVX512-FCP-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14042 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14043 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
14044 ; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14045 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14046 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
14047 ; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14048 ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0
14049 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
14050 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
14051 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
14052 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
14053 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
14054 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
14055 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14056 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14057 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14058 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3]
14059 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14060 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14061 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
14062 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14063 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
14064 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14065 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14066 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
14067 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14068 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14069 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14070 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14071 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
14072 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14073 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14074 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3]
14075 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14076 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
14077 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14078 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14079 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
14080 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14081 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14082 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14083 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14084 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14085 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14086 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
14087 ; AVX512-FCP-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
14088 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14089 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
14090 ; AVX512-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
14091 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14092 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
14093 ; AVX512-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
14094 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14095 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
14096 ; AVX512-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
14097 ; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm1
14098 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm27
14099 ; AVX512-FCP-NEXT: vpermt2d %xmm6, %xmm5, %xmm1
14100 ; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm18
14101 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
14102 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm22
14103 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
14104 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14105 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14106 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
14107 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14108 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14109 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3]
14110 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14111 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
14112 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14113 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14114 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
14115 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14116 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
14117 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14118 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14119 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3]
14120 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14121 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
14122 ; AVX512-FCP-NEXT: # ymm4 = mem[0,1,1,3]
14123 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14124 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,2,2,3,4,6,6,7]
14125 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14126 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,2,2,3,4,6,6,7]
14127 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14128 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
14129 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14130 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14131 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14132 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14133 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14134 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
14135 ; AVX512-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14136 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14137 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14138 ; AVX512-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14139 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14140 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
14141 ; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14142 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14143 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
14144 ; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14145 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm0
14146 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm31
14147 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
14148 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
14149 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
14150 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm21
14151 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14152 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
14153 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14154 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
14155 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
14156 ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3]
14157 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14158 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14159 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3]
14160 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14161 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4]
14162 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14163 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4]
14164 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14165 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
14166 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14167 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3]
14168 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14169 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14170 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
14171 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14172 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,2,3,4,6,6,7]
14173 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14174 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,2,2,3,4,6,6,7]
14175 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14176 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
14177 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14178 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
14179 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14180 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
14181 ; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14182 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14183 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
14184 ; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14185 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14186 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
14187 ; AVX512-FCP-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14188 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14189 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
14190 ; AVX512-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14191 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm1
14192 ; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm1
14193 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
14194 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm16
14195 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
14196 ; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm28
14197 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm30
14198 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14199 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3]
14200 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
14201 ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3]
14202 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14203 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14204 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3]
14205 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14206 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4]
14207 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14208 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4]
14209 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14210 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
14211 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
14212 ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3]
14213 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14214 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14215 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
14216 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14217 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
14218 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14219 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
14220 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14221 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
14222 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
14223 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
14224 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
14225 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14226 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
14227 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
14228 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
14229 ; AVX512-FCP-NEXT: vpermt2d %xmm19, %xmm15, %xmm0
14230 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14231 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
14232 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14233 ; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14234 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14235 ; AVX512-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
14236 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14237 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
14238 ; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
14239 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14240 ; AVX512-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
14241 ; AVX512-FCP-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14242 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
14243 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14244 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
14245 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14246 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
14247 ; AVX512-FCP-NEXT: vpermt2d %xmm24, %xmm15, %xmm1
14248 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1]
14249 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
14250 ; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
14251 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14252 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm13
14253 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14254 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
14255 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14256 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14257 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
14258 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7]
14259 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
14260 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14261 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14262 ; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
14263 ; AVX512-FCP-NEXT: vpermt2d %xmm25, %xmm15, %xmm0
14264 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm13
14265 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1]
14266 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm21
14267 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14268 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14269 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14270 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7]
14271 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14272 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14273 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
14274 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
14275 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14276 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14277 ; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm6
14278 ; AVX512-FCP-NEXT: vpermi2d %xmm28, %xmm30, %xmm6
14279 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
14280 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm7
14281 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
14282 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14283 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14284 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
14285 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14286 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14287 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
14288 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
14289 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14290 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14291 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14292 ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0
14293 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3]
14294 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14295 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
14296 ; AVX512-FCP-NEXT: vpermt2d %xmm29, %xmm5, %xmm0
14297 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3]
14298 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14299 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14300 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14301 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
14302 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14303 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
14304 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14305 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
14306 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14307 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
14308 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14309 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14310 ; AVX512-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
14311 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14312 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
14313 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
14314 ; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
14315 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14316 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm23
14317 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14318 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14319 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14320 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
14321 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3]
14322 ; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm5, %xmm1
14323 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
14324 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm19
14325 ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm18
14326 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14327 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
14328 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7]
14329 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14330 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
14331 ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
14332 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14333 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14334 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
14335 ; AVX512-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
14336 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14337 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
14338 ; AVX512-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
14339 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14340 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
14341 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14342 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14343 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14344 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14345 ; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
14346 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3]
14347 ; AVX512-FCP-NEXT: vpermt2d %xmm13, %xmm5, %xmm0
14348 ; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm22
14349 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload
14350 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3]
14351 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14352 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm21
14353 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
14354 ; AVX512-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
14355 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14356 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
14357 ; AVX512-FCP-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7]
14358 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14359 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
14360 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
14361 ; AVX512-FCP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
14362 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14363 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
14364 ; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
14365 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14366 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
14367 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14368 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1}
14369 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm1
14370 ; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm25
14371 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm27
14372 ; AVX512-FCP-NEXT: vpermi2d %xmm7, %xmm17, %xmm1
14373 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
14374 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3]
14375 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3]
14376 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
14377 ; AVX512-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
14378 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14379 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
14380 ; AVX512-FCP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
14381 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14382 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
14383 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
14384 ; AVX512-FCP-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
14385 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14386 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14387 ; AVX512-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
14388 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14389 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
14390 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
14391 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
14392 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
14393 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
14394 ; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
14395 ; AVX512-FCP-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
14396 ; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14397 ; AVX512-FCP-NEXT: # xmm1 = xmm0[0,1],mem[2,3]
14398 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
14399 ; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
14400 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14401 ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
14402 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14403 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
14404 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
14405 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14406 ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
14407 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14408 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
14409 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
14410 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
14411 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
14412 ; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm17, %xmm0
14413 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
14414 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
14415 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14416 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14417 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
14418 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14419 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14420 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
14421 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
14422 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14423 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
14424 ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm1
14425 ; AVX512-FCP-NEXT: vpermt2d %xmm22, %xmm17, %xmm1
14426 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
14427 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14428 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
14429 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14430 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14431 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
14432 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14433 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14434 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
14435 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
14436 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1}
14437 ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm4
14438 ; AVX512-FCP-NEXT: vpermt2d %xmm28, %xmm17, %xmm4
14439 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3]
14440 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
14441 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14442 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14443 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
14444 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14445 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14446 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
14447 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
14448 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
14449 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
14450 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14451 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rsi)
14452 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14453 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi)
14454 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14455 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
14456 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14457 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx)
14458 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14459 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rcx)
14460 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14461 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rcx)
14462 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14463 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%r8)
14464 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14465 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%r8)
14466 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14467 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%r9)
14468 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14469 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%r9)
14470 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14471 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14472 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rax)
14473 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14474 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rax)
14475 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14476 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax)
14477 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14478 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rax)
14479 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14480 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
14481 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
14482 ; AVX512-FCP-NEXT: addq $2312, %rsp # imm = 0x908
14483 ; AVX512-FCP-NEXT: vzeroupper
14484 ; AVX512-FCP-NEXT: retq
14485 ;
14486 ; AVX512DQ-LABEL: load_i16_stride8_vf64:
14487 ; AVX512DQ: # %bb.0:
14488 ; AVX512DQ-NEXT: subq $2408, %rsp # imm = 0x968
14489 ; AVX512DQ-NEXT: vmovdqa 368(%rdi), %xmm1
14490 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14491 ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm0
14492 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14493 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
14494 ; AVX512DQ-NEXT: vmovdqa 336(%rdi), %xmm2
14495 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14496 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1
14497 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14498 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14499 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4]
14500 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14501 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14502 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
14503 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
14504 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14505 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
14506 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14507 ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
14508 ; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm0
14509 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14510 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm2
14511 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14512 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14513 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14514 ; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm0
14515 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14516 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm2
14517 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14518 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14519 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
14520 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm31
14521 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14522 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14523 ; AVX512DQ-NEXT: vmovdqa 480(%rdi), %ymm1
14524 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14525 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
14526 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14527 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
14528 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14529 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14530 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm2
14531 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14532 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
14533 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14534 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
14535 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14536 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14537 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14538 ; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm2
14539 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14540 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
14541 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14542 ; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm2
14543 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14544 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
14545 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14546 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
14547 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14548 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14549 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
14550 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14551 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14552 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14553 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14554 ; AVX512DQ-NEXT: movb $-64, %al
14555 ; AVX512DQ-NEXT: kmovw %eax, %k1
14556 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14557 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm2
14558 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14559 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm1
14560 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14561 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14562 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3
14563 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14564 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2
14565 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14566 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
14567 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2]
14568 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
14569 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14570 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14571 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14572 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14573 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14574 ; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm6, %xmm2
14575 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
14576 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14577 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
14578 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14579 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4
14580 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14581 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm5
14582 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14583 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
14584 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14585 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
14586 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
14587 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm17
14588 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14589 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm2
14590 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14591 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
14592 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14593 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2
14594 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14595 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
14596 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14597 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,4]
14598 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14599 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14600 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,4]
14601 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14602 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14603 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14604 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm3
14605 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14606 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm3[0,1,0,2]
14607 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm3
14608 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14609 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm3[0,1,0,2]
14610 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,2,2,3,4,6,6,7]
14611 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14612 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14613 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[0,2,2,3,4,6,6,7]
14614 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14615 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
14616 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14617 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14618 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14619 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14620 ; AVX512DQ-NEXT: vmovdqa 880(%rdi), %xmm1
14621 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14622 ; AVX512DQ-NEXT: vmovdqa 864(%rdi), %xmm0
14623 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14624 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
14625 ; AVX512DQ-NEXT: vmovdqa 848(%rdi), %xmm2
14626 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14627 ; AVX512DQ-NEXT: vmovdqa 832(%rdi), %xmm1
14628 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14629 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14630 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
14631 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
14632 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14633 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14634 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14635 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
14636 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14637 ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
14638 ; AVX512DQ-NEXT: vmovdqa %xmm6, %xmm10
14639 ; AVX512DQ-NEXT: vmovdqa 816(%rdi), %xmm0
14640 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14641 ; AVX512DQ-NEXT: vmovdqa 800(%rdi), %xmm2
14642 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14643 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14644 ; AVX512DQ-NEXT: vmovdqa 784(%rdi), %xmm0
14645 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14646 ; AVX512DQ-NEXT: vmovdqa 768(%rdi), %xmm2
14647 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14648 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14649 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
14650 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm19
14651 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm20
14652 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14653 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14654 ; AVX512DQ-NEXT: vmovdqa 992(%rdi), %ymm1
14655 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14656 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
14657 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14658 ; AVX512DQ-NEXT: vmovdqa 960(%rdi), %ymm1
14659 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14660 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2]
14661 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14662 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,0,4,5,6,4]
14663 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14664 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4]
14665 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14666 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14667 ; AVX512DQ-NEXT: vmovdqa 928(%rdi), %ymm2
14668 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14669 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
14670 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14671 ; AVX512DQ-NEXT: vmovdqa 896(%rdi), %ymm2
14672 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14673 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm2[0,1,0,2]
14674 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
14675 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14676 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
14677 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14678 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5],ymm9[6,7]
14679 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14680 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14681 ; AVX512DQ-NEXT: vmovdqa 624(%rdi), %xmm2
14682 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14683 ; AVX512DQ-NEXT: vmovdqa 608(%rdi), %xmm1
14684 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14685 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14686 ; AVX512DQ-NEXT: vmovdqa 592(%rdi), %xmm5
14687 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14688 ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm2
14689 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14690 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
14691 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
14692 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
14693 ; AVX512DQ-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
14694 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14695 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14696 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14697 ; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
14698 ; AVX512DQ-NEXT: vmovdqa 560(%rdi), %xmm1
14699 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14700 ; AVX512DQ-NEXT: vmovdqa 544(%rdi), %xmm5
14701 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14702 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
14703 ; AVX512DQ-NEXT: vmovdqa 528(%rdi), %xmm1
14704 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14705 ; AVX512DQ-NEXT: vmovdqa 512(%rdi), %xmm5
14706 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14707 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
14708 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
14709 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3]
14710 ; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm1
14711 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14712 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
14713 ; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm1
14714 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14715 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
14716 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14717 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,0,4,5,6,4]
14718 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm10[7]
; AVX512DQ-NEXT: vmovdqa 672(%rdi), %ymm5
; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2]
; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm5
; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
; AVX512DQ-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
; AVX512DQ-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3]
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1]
; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm9
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
; AVX512DQ-NEXT: vmovdqa %xmm8, %xmm5
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm4
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm16
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm27
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm23
; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm21
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm25
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm22
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm24
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm18
; AVX512DQ-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm3 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0]
; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm1
; AVX512DQ-NEXT: vpermt2d %xmm16, %xmm0, %xmm1
; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm1
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm3
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm3
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm13
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm1
; AVX512DQ-NEXT: vpermt2d %xmm22, %xmm16, %xmm1
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm0
; AVX512DQ-NEXT: vpermt2d %xmm24, %xmm16, %xmm0
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vpermt2d %xmm18, %xmm16, %xmm15
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm1 = xmm15[0,1],mem[2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4]
; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm10, %xmm6
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm28
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm30 = mem[0,1,1,3]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm4 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm10, %xmm1
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm16
; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm20
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm22 = mem[0,1,1,3]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm29 = mem[0,1,1,3]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm1
; AVX512DQ-NEXT: vpermi2d %xmm2, %xmm8, %xmm1
; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm19
; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm24
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm31
; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm21
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7]
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm10 = mem[0,1,1,3]
; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
; AVX512DQ-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm13
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
; AVX512DQ-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3]
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1]
; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm7
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm9
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm6
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm5
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3]
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm27
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm25
; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm23
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm19
; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm30
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm22
; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm4
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3]
; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm29
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm18
; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm31
; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX512DQ-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm1
; AVX512DQ-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm3
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm6
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7]
; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm6
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm3
; AVX512DQ-NEXT: vpermt2d %xmm23, %xmm16, %xmm3
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm3
; AVX512DQ-NEXT: vpermt2d %xmm30, %xmm16, %xmm3
; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
; AVX512DQ-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1}
; AVX512DQ-NEXT: vpermt2d %xmm29, %xmm16, %xmm4
; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm7
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm4
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rsi)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, (%rsi)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rdx)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, (%rdx)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rcx)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, (%rcx)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, 64(%r8)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, (%r8)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, 64(%r9)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, (%r9)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm3, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax)
; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
; AVX512DQ-NEXT: vmovaps %zmm0, (%rax)
; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax)
; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
; AVX512DQ-NEXT: addq $2408, %rsp # imm = 0x968
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
;
; AVX512DQ-FCP-LABEL: load_i16_stride8_vf64:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: subq $2312, %rsp # imm = 0x908
; AVX512DQ-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0
; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm30
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: movb $-64, %al
; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm19
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm31
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 880(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 864(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa 848(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 832(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm2
; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
; AVX512DQ-FCP-NEXT: vmovdqa 816(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 800(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vmovdqa 784(%rdi), %xmm1
; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 768(%rdi), %xmm3
; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm25
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm9
; AVX512DQ-FCP-NEXT: vmovdqa 992(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm0
; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4]
; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX512DQ-FCP-NEXT: vmovdqa 928(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vmovdqa 896(%rdi), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2]
; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7]
; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7]
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1}
; AVX512DQ-FCP-NEXT: vmovdqa 624(%rdi), %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15582 ; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %xmm1
15583 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15584 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
15585 ; AVX512DQ-FCP-NEXT: vmovdqa 592(%rdi), %xmm0
15586 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15587 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm1
15588 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15589 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
15590 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm0
15591 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm29
15592 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm11, %xmm2, %xmm0
15593 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm11, %xmm16
15594 ; AVX512DQ-FCP-NEXT: vmovdqa 560(%rdi), %xmm1
15595 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15596 ; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %xmm2
15597 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15598 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15599 ; AVX512DQ-FCP-NEXT: vmovdqa 528(%rdi), %xmm1
15600 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15601 ; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm2
15602 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15603 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15604 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
15605 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
15606 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm11, %xmm21
15607 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
15608 ; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm0
15609 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15610 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
15611 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15612 ; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm0
15613 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15614 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
15615 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15616 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
15617 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15618 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
15619 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
15620 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15621 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7]
15622 ; AVX512DQ-FCP-NEXT: vmovdqa 672(%rdi), %ymm0
15623 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15624 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
15625 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15626 ; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm0
15627 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15628 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2]
15629 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15630 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7]
15631 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15632 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7]
15633 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15634 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
15635 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
15636 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
15637 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
15638 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15639 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0]
15640 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
15641 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm0
15642 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm30
15643 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1]
15644 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
15645 ; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
15646 ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15647 ; AVX512DQ-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
15648 ; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15649 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
15650 ; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
15651 ; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15652 ; AVX512DQ-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
15653 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15654 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
15655 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15656 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
15657 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
15658 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
15659 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
15660 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
15661 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm9
15662 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1]
15663 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
15664 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm11
15665 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15666 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15667 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
15668 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15669 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15670 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
15671 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
15672 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
15673 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
15674 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15675 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
15676 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
15677 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm11
15678 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm25, %xmm13, %xmm0
15679 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm8
15680 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1]
15681 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm19
15682 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
15683 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15684 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15685 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
15686 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15687 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15688 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
15689 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15690 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
15691 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
15692 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
15693 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm21, %xmm13, %xmm3
15694 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1]
15695 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm6
15696 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
15697 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm4
15698 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15699 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15700 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
15701 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15702 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15703 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
15704 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
15705 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
15706 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15707 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15708 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0
15709 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3]
15710 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15711 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
15712 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm24, %xmm5, %xmm0
15713 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
15714 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm28
15715 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3]
15716 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15717 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15718 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15719 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
15720 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15721 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15722 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15723 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
15724 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15725 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
15726 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15727 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15728 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
15729 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15730 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
15731 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
15732 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
15733 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15734 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
15735 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15736 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15737 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
15738 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm1
15739 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
15740 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3]
15741 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm1
15742 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3]
15743 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm25
15744 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm23
15745 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15746 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15747 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
15748 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15749 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
15750 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
15751 ; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
15752 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15753 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
15754 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
15755 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
15756 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15757 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
15758 ; AVX512DQ-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
15759 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15760 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
15761 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15762 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15763 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15764 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15765 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
15766 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3]
15767 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm8, %xmm5, %xmm0
15768 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm19
15769 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm11, %xmm20
15770 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
15771 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15772 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24
15773 ; AVX512DQ-FCP-NEXT: vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload
15774 ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
15775 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15776 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
15777 ; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
15778 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15779 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
15780 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
15781 ; AVX512DQ-FCP-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
15782 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15783 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
15784 ; AVX512DQ-FCP-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7]
15785 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15786 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
15787 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15788 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1}
15789 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm0
15790 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3]
15791 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm16, %xmm5, %xmm0
15792 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3]
15793 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
15794 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
15795 ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7]
15796 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15797 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
15798 ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
15799 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15800 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15801 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
15802 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
15803 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15804 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15805 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
15806 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15807 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
15808 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
15809 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
15810 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
15811 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15812 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
15813 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
15814 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm28, %xmm16, %xmm1
15815 ; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
15816 ; AVX512DQ-FCP-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
15817 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15818 ; AVX512DQ-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15819 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15820 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm3
15821 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15822 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
15823 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
15824 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15825 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm15
15826 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15827 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
15828 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
15829 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
15830 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm1
15831 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm23, %xmm16, %xmm1
15832 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm3
15833 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
15834 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
15835 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15836 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15837 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
15838 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15839 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15840 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
15841 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
15842 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
15843 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15844 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15845 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
15846 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm16, %xmm0
15847 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
15848 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
15849 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15850 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15851 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15852 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
15853 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15854 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15855 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
15856 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
15857 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
15858 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
15859 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm21, %xmm16, %xmm1
15860 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
15861 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15862 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15863 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
15864 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15865 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15866 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
15867 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
15868 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15869 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15870 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15871 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15872 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
15873 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15874 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15875 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
15876 ; AVX512DQ-FCP-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15877 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15878 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
15879 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15880 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15881 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
15882 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15883 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0
15884 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
15885 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
15886 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
15887 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
15888 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
15889 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
15890 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15891 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15892 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15893 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3]
15894 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15895 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15896 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
15897 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15898 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
15899 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15900 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15901 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
15902 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15903 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15904 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15905 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15906 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
15907 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15908 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15909 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3]
15910 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15911 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
15912 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15913 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15914 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
15915 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15916 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15917 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15918 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15919 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
15920 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15921 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
15922 ; AVX512DQ-FCP-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15923 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15924 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
15925 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15926 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15927 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
15928 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15929 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15930 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
15931 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15932 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm1
15933 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm27
15934 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm6, %xmm5, %xmm1
15935 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm18
15936 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
15937 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm22
15938 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
15939 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15940 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15941 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
15942 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15943 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15944 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3]
15945 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15946 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
15947 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15948 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15949 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
15950 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15951 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
15952 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
15953 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15954 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3]
15955 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15956 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
15957 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,1,3]
15958 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15959 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,2,2,3,4,6,6,7]
15960 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15961 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,2,2,3,4,6,6,7]
15962 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15963 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
15964 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15965 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15966 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15967 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15968 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15969 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
15970 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15971 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15972 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
15973 ; AVX512DQ-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15974 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15975 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
15976 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15977 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15978 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
15979 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15980 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm0
15981 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31
15982 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
15983 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
15984 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
15985 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm21
15986 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15987 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
15988 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15989 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
15990 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
15991 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3]
15992 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15993 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15994 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3]
15995 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15996 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4]
15997 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15998 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4]
15999 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
16000 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
16001 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
16002 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3]
16003 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16004 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
16005 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
16006 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16007 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,2,3,4,6,6,7]
16008 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
16009 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,2,2,3,4,6,6,7]
16010 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
16011 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
16012 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16013 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
16014 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
16015 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
16016 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
16017 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
16018 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
16019 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
16020 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
16021 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
16022 ; AVX512DQ-FCP-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
16023 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
16024 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
16025 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
16026 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm1
16027 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm1
16028 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
16029 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm16
16030 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
16031 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm28
16032 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm30
16033 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16034 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3]
16035 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
16036 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3]
16037 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16038 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
16039 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3]
16040 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16041 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4]
16042 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
16043 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4]
16044 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
16045 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
16046 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
16047 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3]
16048 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16049 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
16050 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
16051 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16052 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
16053 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
16054 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
16055 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
16056 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
16057 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
16058 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
16059 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
16060 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16061 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
16062 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
16063 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
16064 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm19, %xmm15, %xmm0
16065 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16066 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
16067 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
16068 ; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
16069 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
16070 ; AVX512DQ-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
16071 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
16072 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
16073 ; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
16074 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
16075 ; AVX512DQ-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
16076 ; AVX512DQ-FCP-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
16077 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
16078 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
16079 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
16080 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
16081 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
16082 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm24, %xmm15, %xmm1
16083 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1]
16084 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
16085 ; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
16086 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
16087 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm13
16088 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
16089 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
16090 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
16091 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
16092 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
16093 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7]
16094 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
16095 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
16096 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16097 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
16098 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm25, %xmm15, %xmm0
16099 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm13
16100 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1]
16101 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm21
16102 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
16103 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
16104 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
16105 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7]
16106 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
16107 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
16108 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
16109 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
16110 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
16111 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
16112 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm6
16113 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm28, %xmm30, %xmm6
16114 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
16115 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm7
16116 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
16117 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
16118 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
16119 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
16120 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
16121 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
16122 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
16123 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
16124 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
16125 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
16126 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16127 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0
16128 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3]
16129 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16130 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
16131 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm29, %xmm5, %xmm0
16132 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3]
16133 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
16134 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
16135 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
16136 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
16137 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
16138 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
16139 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
16140 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
16141 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
16142 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
16143 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
16144 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
16145 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
16146 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
16147 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
16148 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
16149 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
16150 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
16151 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm23
16152 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
16153 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
16154 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
16155 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
16156 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3]
16157 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm5, %xmm1
16158 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
16159 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm19
16160 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm18
16161 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
16162 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
16163 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7]
16164 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
16165 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
16166 ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
16167 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
16168 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
16169 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
16170 ; AVX512DQ-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
16171 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
16172 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
16173 ; AVX512DQ-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
16174 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
16175 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
16176 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
16177 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
16178 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
16179 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16180 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
16181 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3]
16182 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm13, %xmm5, %xmm0
16183 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm22
16184 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload
16185 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3]
16186 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
16187 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm21
16188 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
16189 ; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
16190 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
16191 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
16192 ; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7]
16193 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
16194 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
16195 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
16196 ; AVX512DQ-FCP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
16197 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
16198 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
16199 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
16200 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
16201 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
16202 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16203 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1}
16204 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm1
16205 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm25
16206 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm27
16207 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm7, %xmm17, %xmm1
16208 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
16209 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3]
16210 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3]
16211 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
16212 ; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
16213 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
16214 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
16215 ; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
16216 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
16217 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
16218 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
16219 ; AVX512DQ-FCP-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
16220 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
16221 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
16222 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
16223 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
16224 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
16225 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
16226 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
16227 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
16228 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
16229 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
16230 ; AVX512DQ-FCP-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
16231 ; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
16232 ; AVX512DQ-FCP-NEXT: # xmm1 = xmm0[0,1],mem[2,3]
16233 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
16234 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
16235 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
16236 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
16237 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
16238 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
16239 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
16240 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
16241 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
16242 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
16243 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
16244 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
16245 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
16246 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
16247 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm17, %xmm0
16248 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
16249 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
16250 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
16251 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
16252 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
16253 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
16254 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
16255 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
16256 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
16257 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
16258 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
16259 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1
16260 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm22, %xmm17, %xmm1
16261 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
16262 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
16263 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
16264 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
16265 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
16266 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
16267 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
16268 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
16269 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
16270 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
16271 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1}
16272 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm4
16273 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm28, %xmm17, %xmm4
16274 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3]
16275 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
16276 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
16277 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
16278 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
16279 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
16280 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
16281 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
16282 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
16283 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
16284 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
16285 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16286 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rsi)
16287 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16288 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi)
16289 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16290 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
16291 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16292 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx)
16293 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16294 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rcx)
16295 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16296 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rcx)
16297 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16298 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%r8)
16299 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16300 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%r8)
16301 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16302 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%r9)
16303 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16304 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%r9)
16305 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16306 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16307 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rax)
16308 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16309 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rax)
16310 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16311 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax)
16312 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16313 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rax)
16314 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16315 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
16316 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
16317 ; AVX512DQ-FCP-NEXT: addq $2312, %rsp # imm = 0x908
16318 ; AVX512DQ-FCP-NEXT: vzeroupper
16319 ; AVX512DQ-FCP-NEXT: retq
16321 ; AVX512BW-LABEL: load_i16_stride8_vf64:
16322 ; AVX512BW: # %bb.0:
16323 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
16324 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
16325 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
16326 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
16327 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29
16328 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
16329 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30
16330 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm31
16331 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3
16332 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7
16333 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6
16334 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9
16335 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5
16336 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12
16337 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2
16338 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14
16339 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11
16340 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm16
16341 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm15
16342 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
16343 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16344 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17
16345 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm13, %zmm17
16346 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18
16347 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm18
16348 ; AVX512BW-NEXT: movb $-64, %dil
16349 ; AVX512BW-NEXT: kmovd %edi, %k1
16350 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
16351 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
16352 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm10
16353 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8
16354 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm8
16355 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
16356 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
16357 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
16358 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
16359 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16360 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm13, %zmm10
16361 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16362 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
16363 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm8
16364 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm13
16365 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16366 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
16367 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
16368 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16369 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10
16370 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10
16371 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13
16372 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm13
16373 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16374 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
16375 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm10
16376 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4
16377 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm4
16378 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16379 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
16380 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4
16381 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm4
16382 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16383 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm8, %zmm10
16384 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
16385 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
16386 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm8, %zmm4
16387 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm8
16388 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
16389 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
16390 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
16391 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16392 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
16393 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16394 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
16395 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16396 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16397 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
16398 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16399 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
16400 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16401 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16402 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
16403 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
16404 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16405 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16406 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16407 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16408 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
16409 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16410 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16411 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16412 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
16413 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
16414 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16415 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
16416 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16417 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
16418 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16419 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16420 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
16421 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16422 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
16423 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16424 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16425 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
16426 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
16427 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16428 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16429 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16430 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16431 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
16432 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16433 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16434 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16435 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
16436 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
16437 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16438 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
16439 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16440 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
16441 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16442 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16443 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
16444 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16445 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
16446 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16447 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16448 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
16449 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
16450 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16451 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16452 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16453 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16454 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
16455 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16456 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16457 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16458 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
16459 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
16460 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16461 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
16462 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16463 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
16464 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16465 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16466 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
16467 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16468 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
16469 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16470 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16471 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
16472 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
16473 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16474 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16475 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16476 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16477 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
16478 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16479 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16480 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16481 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
16482 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
16483 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16484 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
16485 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16486 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
16487 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16488 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16489 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
16490 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16491 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
16492 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16493 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16494 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
16495 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10
16496 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm10
16497 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13
16498 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm13
16499 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16500 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
16501 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm10
16502 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16503 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16504 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
16505 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
16506 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16507 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm10, %zmm15
16508 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm10, %zmm11
16509 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
16510 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm10, %zmm2
16511 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm5
16512 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
16513 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
16514 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm6
16515 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm10, %zmm3
16516 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
16517 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm10, %zmm1
16518 ; AVX512BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm0
16519 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16520 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
16521 ; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rsi)
16522 ; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rsi)
16523 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx)
16524 ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx)
16525 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rcx)
16526 ; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx)
16527 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8)
16528 ; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8)
16529 ; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%r9)
16530 ; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9)
16531 ; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r11)
16532 ; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r11)
16533 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r10)
16534 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10)
16535 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
16536 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
16537 ; AVX512BW-NEXT: vzeroupper
16538 ; AVX512BW-NEXT: retq
16540 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf64:
16541 ; AVX512BW-FCP: # %bb.0:
16542 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16543 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
16544 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
16545 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
16546 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
16547 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
16548 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
16549 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31
16550 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
16551 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
16552 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
16553 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9
16554 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
16555 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
16556 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2
16557 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14
16558 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
16559 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16
16560 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15
16561 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
16562 ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16563 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
16564 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm13, %zmm17
16565 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18
16566 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm18
16567 ; AVX512BW-FCP-NEXT: movb $-64, %dil
16568 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
16569 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
16570 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
16571 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm13, %zmm10
16572 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
16573 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm13, %zmm8
16574 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
16575 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
16576 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
16577 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
16578 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16579 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm13, %zmm10
16580 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16581 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
16582 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm13, %zmm8
16583 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm13
16584 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16585 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
16586 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
16587 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16588 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
16589 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm8, %zmm10
16590 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13
16591 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm8, %zmm13
16592 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16593 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
16594 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm10
16595 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
16596 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm8, %zmm4
16597 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16598 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
16599 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
16600 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm4
16601 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16602 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm8, %zmm10
16603 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
16604 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
16605 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm8, %zmm4
16606 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm8
16607 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
16608 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
16609 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
16610 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16611 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
16612 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16613 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
16614 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16615 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16616 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16617 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16618 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
16619 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16620 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16621 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
16622 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
16623 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16624 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16625 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16626 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16627 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
16628 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16629 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16630 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16631 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
16632 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
16633 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16634 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
16635 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16636 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
16637 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16638 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16639 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16640 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16641 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
16642 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16643 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16644 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
16645 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
16646 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16647 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16648 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16649 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16650 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
16651 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16652 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16653 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16654 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
16655 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
16656 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16657 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
16658 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16659 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
16660 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16661 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16662 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16663 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16664 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
16665 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16666 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16667 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
16668 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
16669 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16670 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16671 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16672 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16673 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
16674 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16675 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16676 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16677 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
16678 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
16679 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16680 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
16681 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16682 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
16683 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16684 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16685 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16686 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16687 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
16688 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16689 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16690 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
16691 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
16692 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16693 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16694 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16695 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16696 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
16697 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16698 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16699 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16700 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
16701 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
16702 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16703 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
16704 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16705 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
16706 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16707 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16708 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16709 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16710 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
16711 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16712 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16713 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
16714 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
16715 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm10
16716 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
16717 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm13
16718 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16719 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
16720 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm10
16721 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16722 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16723 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
16724 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
16725 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16726 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm10, %zmm15
16727 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm10, %zmm11
16728 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
16729 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm10, %zmm2
16730 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm5
16731 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
16732 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
16733 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm6
16734 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm10, %zmm3
16735 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
16736 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm10, %zmm1
16737 ; AVX512BW-FCP-NEXT: vpermt2w %zmm29, %zmm10, %zmm0
16738 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16739 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
16740 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi)
16741 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
16742 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx)
16743 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
16744 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx)
16745 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
16746 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
16747 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
16748 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9)
16749 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9)
16750 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11)
16751 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11)
16752 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10)
16753 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
16754 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
16755 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
16756 ; AVX512BW-FCP-NEXT: vzeroupper
16757 ; AVX512BW-FCP-NEXT: retq
16759 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf64:
16760 ; AVX512DQ-BW: # %bb.0:
16761 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
16762 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
16763 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
16764 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
16765 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29
16766 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
16767 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30
16768 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm31
16769 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3
16770 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7
16771 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6
16772 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9
16773 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5
16774 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12
16775 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2
16776 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm14
16777 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11
16778 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm16
16779 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm15
16780 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
16781 ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16782 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17
16783 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm13, %zmm17
16784 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18
16785 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm18
16786 ; AVX512DQ-BW-NEXT: movb $-64, %dil
16787 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
16788 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
16789 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10
16790 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm10
16791 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8
16792 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm8
16793 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
16794 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
16795 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
16796 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
16797 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16798 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm13, %zmm10
16799 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16800 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
16801 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm8
16802 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm13
16803 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16804 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
16805 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
16806 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16807 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10
16808 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10
16809 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13
16810 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm13
16811 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16812 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10
16813 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm10
16814 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4
16815 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm4
16816 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16817 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
16818 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4
16819 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm4
16820 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16821 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm8, %zmm10
16822 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
16823 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4
16824 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm8, %zmm4
16825 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm8
16826 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
16827 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
16828 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
16829 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16830 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
16831 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16832 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
16833 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16834 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16835 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
16836 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16837 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
16838 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16839 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16840 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
16841 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
16842 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16843 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16844 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16845 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16846 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
16847 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16848 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16849 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16850 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
16851 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
16852 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16853 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
16854 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16855 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
16856 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16857 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16858 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
16859 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16860 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
16861 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16862 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16863 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
16864 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
16865 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16866 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16867 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16868 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16869 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
16870 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16871 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16872 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16873 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
16874 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
16875 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16876 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
16877 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16878 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
16879 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16880 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16881 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
16882 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16883 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
16884 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16885 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16886 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
16887 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
16888 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16889 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16890 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16891 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16892 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
16893 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16894 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16895 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16896 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
16897 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
16898 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16899 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
16900 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16901 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
16902 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16903 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16904 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
16905 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16906 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
16907 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16908 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16909 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
16910 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
16911 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16912 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16913 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16914 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16915 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
16916 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16917 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16918 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16919 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
16920 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
16921 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16922 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
16923 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16924 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
16925 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16926 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16927 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
16928 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16929 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
16930 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16931 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16932 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
16933 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10
16934 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm10
16935 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13
16936 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm13
16937 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16938 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10
16939 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm10
16940 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16941 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16942 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
16943 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
16944 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16945 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm10, %zmm15
16946 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm10, %zmm11
16947 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
16948 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm10, %zmm2
16949 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm5
16950 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
16951 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
16952 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm6
16953 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm10, %zmm3
16954 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
16955 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm10, %zmm1
16956 ; AVX512DQ-BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm0
16957 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16958 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
16959 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rsi)
16960 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rsi)
16961 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rdx)
16962 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx)
16963 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%rcx)
16964 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rcx)
16965 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8)
16966 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r8)
16967 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%r9)
16968 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%r9)
16969 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%r11)
16970 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, (%r11)
16971 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%r10)
16972 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r10)
16973 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
16974 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
16975 ; AVX512DQ-BW-NEXT: vzeroupper
16976 ; AVX512DQ-BW-NEXT: retq
16978 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf64:
16979 ; AVX512DQ-BW-FCP: # %bb.0:
16980 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16981 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
16982 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
16983 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
16984 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
16985 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
16986 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
16987 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31
16988 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
16989 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
16990 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
16991 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9
16992 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
16993 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
16994 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2
16995 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14
16996 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
16997 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16
16998 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15
16999 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
17000 ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17001 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
17002 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm13, %zmm17
17003 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18
17004 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm18
17005 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil
17006 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
17007 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
17008 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
17009 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm13, %zmm10
17010 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
17011 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm13, %zmm8
17012 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
17013 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
17014 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
17015 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
17016 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17017 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm13, %zmm10
17018 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17019 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
17020 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm13, %zmm8
17021 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm13
17022 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17023 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
17024 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
17025 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17026 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
17027 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm8, %zmm10
17028 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13
17029 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm8, %zmm13
17030 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
17031 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
17032 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm10
17033 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
17034 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm8, %zmm4
17035 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
17036 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
17037 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
17038 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm4
17039 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17040 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm8, %zmm10
17041 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
17042 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
17043 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm8, %zmm4
17044 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm8
17045 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
17046 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
17047 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
17048 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17049 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
17050 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
17051 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
17052 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
17053 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17054 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17055 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
17056 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
17057 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
17058 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17059 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
17060 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
17061 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
17062 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17063 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
17064 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17065 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
17066 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
17067 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
17068 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
17069 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
17070 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
17071 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17072 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
17073 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
17074 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
17075 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
17076 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17077 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17078 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
17079 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
17080 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
17081 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17082 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
17083 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
17084 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
17085 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17086 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
17087 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17088 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
17089 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
17090 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
17091 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
17092 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
17093 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
17094 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17095 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
17096 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
17097 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
17098 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
17099 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17100 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17101 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
17102 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
17103 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
17104 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17105 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
17106 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
17107 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
17108 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17109 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
17110 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17111 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
17112 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
17113 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
17114 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
17115 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
17116 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
17117 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17118 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
17119 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
17120 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
17121 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
17122 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17123 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17124 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
17125 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
17126 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
17127 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17128 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
17129 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
17130 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
17131 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17132 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
17133 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17134 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
17135 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
17136 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
17137 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
17138 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
17139 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
17140 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17141 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
17142 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
17143 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
17144 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
17145 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17146 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17147 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
17148 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
17149 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
17150 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17151 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
17152 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
17153 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm10
17154 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
17155 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm13
17156 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
17157 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
17158 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm10
17159 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
17160 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
17161 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
17162 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
17163 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17164 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm10, %zmm15
17165 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm10, %zmm11
17166 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
17167 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm10, %zmm2
17168 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm5
17169 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
17170 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
17171 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm6
17172 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm10, %zmm3
17173 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
17174 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm10, %zmm1
17175 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm10, %zmm0
17176 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17177 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
17178 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi)
17179 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
17180 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx)
17181 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
17182 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx)
17183 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
17184 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
17185 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
17186 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9)
17187 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9)
17188 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11)
17189 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11)
17190 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10)
17191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
17192 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
17193 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
17194 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
17195 ; AVX512DQ-BW-FCP-NEXT: retq
%wide.vec = load <512 x i16>, ptr %in.vec, align 64
%strided.vec0 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
%strided.vec1 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
%strided.vec2 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250, i32 258, i32 266, i32 274, i32 282, i32 290, i32 298, i32 306, i32 314, i32 322, i32 330, i32 338, i32 346, i32 354, i32 362, i32 370, i32 378, i32 386, i32 394, i32 402, i32 410, i32 418, i32 426, i32 434, i32 442, i32 450, i32 458, i32 466, i32 474, i32 482, i32 490, i32 498, i32 506>
%strided.vec3 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251, i32 259, i32 267, i32 275, i32 283, i32 291, i32 299, i32 307, i32 315, i32 323, i32 331, i32 339, i32 347, i32 355, i32 363, i32 371, i32 379, i32 387, i32 395, i32 403, i32 411, i32 419, i32 427, i32 435, i32 443, i32 451, i32 459, i32 467, i32 475, i32 483, i32 491, i32 499, i32 507>
%strided.vec4 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252, i32 260, i32 268, i32 276, i32 284, i32 292, i32 300, i32 308, i32 316, i32 324, i32 332, i32 340, i32 348, i32 356, i32 364, i32 372, i32 380, i32 388, i32 396, i32 404, i32 412, i32 420, i32 428, i32 436, i32 444, i32 452, i32 460, i32 468, i32 476, i32 484, i32 492, i32 500, i32 508>
%strided.vec5 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253, i32 261, i32 269, i32 277, i32 285, i32 293, i32 301, i32 309, i32 317, i32 325, i32 333, i32 341, i32 349, i32 357, i32 365, i32 373, i32 381, i32 389, i32 397, i32 405, i32 413, i32 421, i32 429, i32 437, i32 445, i32 453, i32 461, i32 469, i32 477, i32 485, i32 493, i32 501, i32 509>
%strided.vec6 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254, i32 262, i32 270, i32 278, i32 286, i32 294, i32 302, i32 310, i32 318, i32 326, i32 334, i32 342, i32 350, i32 358, i32 366, i32 374, i32 382, i32 390, i32 398, i32 406, i32 414, i32 422, i32 430, i32 438, i32 446, i32 454, i32 462, i32 470, i32 478, i32 486, i32 494, i32 502, i32 510>
%strided.vec7 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255, i32 263, i32 271, i32 279, i32 287, i32 295, i32 303, i32 311, i32 319, i32 327, i32 335, i32 343, i32 351, i32 359, i32 367, i32 375, i32 383, i32 391, i32 399, i32 407, i32 415, i32 423, i32 431, i32 439, i32 447, i32 455, i32 463, i32 471, i32 479, i32 487, i32 495, i32 503, i32 511>
store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
store <64 x i16> %strided.vec3, ptr %out.vec3, align 64
store <64 x i16> %strided.vec4, ptr %out.vec4, align 64
store <64 x i16> %strided.vec5, ptr %out.vec5, align 64
store <64 x i16> %strided.vec6, ptr %out.vec6, align 64
store <64 x i16> %strided.vec7, ptr %out.vec7, align 64