1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
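;
; For readers unfamiliar with the pattern: the sketch below is an editorial aid, not
; part of the checked output. Assuming a hypothetical routine (names are illustrative
; only), this is the kind of scalar stride-8 loop whose vectorization produces the
; deinterleaving shuffle sequences exercised in this file:
;
;   void load_stride8(const short *in, short *out[8], int n) {
;     for (int i = 0; i < n; ++i)      /* one 8-element record per iteration */
;       for (int j = 0; j < 8; ++j)    /* field j goes to its own output stream */
;         out[j][i] = in[8 * i + j];
;   }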
18 define void @load_i16_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
19 ; SSE-LABEL: load_i16_stride8_vf2:
20 ; SSE: # %bb.0:
21 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
22 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
23 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
24 ; SSE-NEXT: movdqa (%rdi), %xmm0
25 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
26 ; SSE-NEXT: movdqa %xmm0, %xmm2
27 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
28 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
29 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[2,3,2,3]
30 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
31 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
32 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
33 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm0[2,3,2,3]
34 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
35 ; SSE-NEXT: movd %xmm2, (%rsi)
36 ; SSE-NEXT: movd %xmm3, (%rdx)
37 ; SSE-NEXT: movd %xmm4, (%rcx)
38 ; SSE-NEXT: movd %xmm5, (%r8)
39 ; SSE-NEXT: movd %xmm0, (%r9)
40 ; SSE-NEXT: movd %xmm1, (%r11)
41 ; SSE-NEXT: movd %xmm6, (%r10)
42 ; SSE-NEXT: movd %xmm7, (%rax)
43 ; SSE-NEXT: retq
44 ;
45 ; AVX-LABEL: load_i16_stride8_vf2:
46 ; AVX: # %bb.0:
47 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
48 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
49 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
50 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
51 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
52 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
53 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
54 ; AVX-NEXT: vmovd %xmm2, (%rsi)
55 ; AVX-NEXT: vpextrd $1, %xmm2, (%rdx)
56 ; AVX-NEXT: vpextrd $2, %xmm2, (%rcx)
57 ; AVX-NEXT: vpextrd $3, %xmm2, (%r8)
58 ; AVX-NEXT: vmovd %xmm0, (%r9)
59 ; AVX-NEXT: vpextrd $1, %xmm0, (%r11)
60 ; AVX-NEXT: vpextrd $2, %xmm0, (%r10)
61 ; AVX-NEXT: vpextrd $3, %xmm0, (%rax)
62 ; AVX-NEXT: retq
63 ;
64 ; AVX2-LABEL: load_i16_stride8_vf2:
65 ; AVX2: # %bb.0:
66 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
67 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
68 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
69 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
70 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
71 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
72 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
73 ; AVX2-NEXT: vmovd %xmm2, (%rsi)
74 ; AVX2-NEXT: vpextrd $1, %xmm2, (%rdx)
75 ; AVX2-NEXT: vpextrd $2, %xmm2, (%rcx)
76 ; AVX2-NEXT: vpextrd $3, %xmm2, (%r8)
77 ; AVX2-NEXT: vmovd %xmm0, (%r9)
78 ; AVX2-NEXT: vpextrd $1, %xmm0, (%r11)
79 ; AVX2-NEXT: vpextrd $2, %xmm0, (%r10)
80 ; AVX2-NEXT: vpextrd $3, %xmm0, (%rax)
81 ; AVX2-NEXT: retq
82 ;
83 ; AVX2-FP-LABEL: load_i16_stride8_vf2:
84 ; AVX2-FP: # %bb.0:
85 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
86 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
87 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
88 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
89 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
90 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
91 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
92 ; AVX2-FP-NEXT: vmovd %xmm2, (%rsi)
93 ; AVX2-FP-NEXT: vpextrd $1, %xmm2, (%rdx)
94 ; AVX2-FP-NEXT: vpextrd $2, %xmm2, (%rcx)
95 ; AVX2-FP-NEXT: vpextrd $3, %xmm2, (%r8)
96 ; AVX2-FP-NEXT: vmovd %xmm0, (%r9)
97 ; AVX2-FP-NEXT: vpextrd $1, %xmm0, (%r11)
98 ; AVX2-FP-NEXT: vpextrd $2, %xmm0, (%r10)
99 ; AVX2-FP-NEXT: vpextrd $3, %xmm0, (%rax)
100 ; AVX2-FP-NEXT: retq
101 ;
102 ; AVX2-FCP-LABEL: load_i16_stride8_vf2:
103 ; AVX2-FCP: # %bb.0:
104 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
105 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
106 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
107 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
108 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
109 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
110 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
111 ; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi)
112 ; AVX2-FCP-NEXT: vpextrd $1, %xmm2, (%rdx)
113 ; AVX2-FCP-NEXT: vpextrd $2, %xmm2, (%rcx)
114 ; AVX2-FCP-NEXT: vpextrd $3, %xmm2, (%r8)
115 ; AVX2-FCP-NEXT: vmovd %xmm0, (%r9)
116 ; AVX2-FCP-NEXT: vpextrd $1, %xmm0, (%r11)
117 ; AVX2-FCP-NEXT: vpextrd $2, %xmm0, (%r10)
118 ; AVX2-FCP-NEXT: vpextrd $3, %xmm0, (%rax)
119 ; AVX2-FCP-NEXT: retq
120 ;
121 ; AVX512-LABEL: load_i16_stride8_vf2:
122 ; AVX512: # %bb.0:
123 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
124 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
125 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
126 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
127 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
128 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
129 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
130 ; AVX512-NEXT: vmovd %xmm2, (%rsi)
131 ; AVX512-NEXT: vpextrd $1, %xmm2, (%rdx)
132 ; AVX512-NEXT: vpextrd $2, %xmm2, (%rcx)
133 ; AVX512-NEXT: vpextrd $3, %xmm2, (%r8)
134 ; AVX512-NEXT: vmovd %xmm0, (%r9)
135 ; AVX512-NEXT: vpextrd $1, %xmm0, (%r11)
136 ; AVX512-NEXT: vpextrd $2, %xmm0, (%r10)
137 ; AVX512-NEXT: vpextrd $3, %xmm0, (%rax)
138 ; AVX512-NEXT: retq
139 ;
140 ; AVX512-FCP-LABEL: load_i16_stride8_vf2:
141 ; AVX512-FCP: # %bb.0:
142 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
143 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
144 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
145 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
146 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
147 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
148 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
149 ; AVX512-FCP-NEXT: vmovd %xmm2, (%rsi)
150 ; AVX512-FCP-NEXT: vpextrd $1, %xmm2, (%rdx)
151 ; AVX512-FCP-NEXT: vpextrd $2, %xmm2, (%rcx)
152 ; AVX512-FCP-NEXT: vpextrd $3, %xmm2, (%r8)
153 ; AVX512-FCP-NEXT: vmovd %xmm0, (%r9)
154 ; AVX512-FCP-NEXT: vpextrd $1, %xmm0, (%r11)
155 ; AVX512-FCP-NEXT: vpextrd $2, %xmm0, (%r10)
156 ; AVX512-FCP-NEXT: vpextrd $3, %xmm0, (%rax)
157 ; AVX512-FCP-NEXT: retq
158 ;
159 ; AVX512DQ-LABEL: load_i16_stride8_vf2:
160 ; AVX512DQ: # %bb.0:
161 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
162 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
163 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
164 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
165 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
166 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
167 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
168 ; AVX512DQ-NEXT: vmovd %xmm2, (%rsi)
169 ; AVX512DQ-NEXT: vpextrd $1, %xmm2, (%rdx)
170 ; AVX512DQ-NEXT: vpextrd $2, %xmm2, (%rcx)
171 ; AVX512DQ-NEXT: vpextrd $3, %xmm2, (%r8)
172 ; AVX512DQ-NEXT: vmovd %xmm0, (%r9)
173 ; AVX512DQ-NEXT: vpextrd $1, %xmm0, (%r11)
174 ; AVX512DQ-NEXT: vpextrd $2, %xmm0, (%r10)
175 ; AVX512DQ-NEXT: vpextrd $3, %xmm0, (%rax)
176 ; AVX512DQ-NEXT: retq
177 ;
178 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf2:
179 ; AVX512DQ-FCP: # %bb.0:
180 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
181 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
182 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
183 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
184 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
185 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
186 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
187 ; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%rsi)
188 ; AVX512DQ-FCP-NEXT: vpextrd $1, %xmm2, (%rdx)
189 ; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm2, (%rcx)
190 ; AVX512DQ-FCP-NEXT: vpextrd $3, %xmm2, (%r8)
191 ; AVX512DQ-FCP-NEXT: vmovd %xmm0, (%r9)
192 ; AVX512DQ-FCP-NEXT: vpextrd $1, %xmm0, (%r11)
193 ; AVX512DQ-FCP-NEXT: vpextrd $2, %xmm0, (%r10)
194 ; AVX512DQ-FCP-NEXT: vpextrd $3, %xmm0, (%rax)
195 ; AVX512DQ-FCP-NEXT: retq
196 ;
197 ; AVX512BW-LABEL: load_i16_stride8_vf2:
198 ; AVX512BW: # %bb.0:
199 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
200 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
201 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
202 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
203 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1
204 ; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
205 ; AVX512BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
206 ; AVX512BW-NEXT: vmovd %xmm2, (%rsi)
207 ; AVX512BW-NEXT: vpextrd $1, %xmm2, (%rdx)
208 ; AVX512BW-NEXT: vpextrd $2, %xmm2, (%rcx)
209 ; AVX512BW-NEXT: vpextrd $3, %xmm2, (%r8)
210 ; AVX512BW-NEXT: vmovd %xmm0, (%r9)
211 ; AVX512BW-NEXT: vpextrd $1, %xmm0, (%r11)
212 ; AVX512BW-NEXT: vpextrd $2, %xmm0, (%r10)
213 ; AVX512BW-NEXT: vpextrd $3, %xmm0, (%rax)
214 ; AVX512BW-NEXT: retq
215 ;
216 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf2:
217 ; AVX512BW-FCP: # %bb.0:
218 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
219 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
220 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
221 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
222 ; AVX512BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
223 ; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
224 ; AVX512BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
225 ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%rsi)
226 ; AVX512BW-FCP-NEXT: vpextrd $1, %xmm2, (%rdx)
227 ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm2, (%rcx)
228 ; AVX512BW-FCP-NEXT: vpextrd $3, %xmm2, (%r8)
229 ; AVX512BW-FCP-NEXT: vmovd %xmm0, (%r9)
230 ; AVX512BW-FCP-NEXT: vpextrd $1, %xmm0, (%r11)
231 ; AVX512BW-FCP-NEXT: vpextrd $2, %xmm0, (%r10)
232 ; AVX512BW-FCP-NEXT: vpextrd $3, %xmm0, (%rax)
233 ; AVX512BW-FCP-NEXT: retq
234 ;
235 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf2:
236 ; AVX512DQ-BW: # %bb.0:
237 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
238 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
239 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
240 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
241 ; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm1
242 ; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
243 ; AVX512DQ-BW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
244 ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%rsi)
245 ; AVX512DQ-BW-NEXT: vpextrd $1, %xmm2, (%rdx)
246 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm2, (%rcx)
247 ; AVX512DQ-BW-NEXT: vpextrd $3, %xmm2, (%r8)
248 ; AVX512DQ-BW-NEXT: vmovd %xmm0, (%r9)
249 ; AVX512DQ-BW-NEXT: vpextrd $1, %xmm0, (%r11)
250 ; AVX512DQ-BW-NEXT: vpextrd $2, %xmm0, (%r10)
251 ; AVX512DQ-BW-NEXT: vpextrd $3, %xmm0, (%rax)
252 ; AVX512DQ-BW-NEXT: retq
253 ;
254 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf2:
255 ; AVX512DQ-BW-FCP: # %bb.0:
256 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
257 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
258 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
259 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
260 ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
261 ; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
262 ; AVX512DQ-BW-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
263 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%rsi)
264 ; AVX512DQ-BW-FCP-NEXT: vpextrd $1, %xmm2, (%rdx)
265 ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm2, (%rcx)
266 ; AVX512DQ-BW-FCP-NEXT: vpextrd $3, %xmm2, (%r8)
267 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm0, (%r9)
268 ; AVX512DQ-BW-FCP-NEXT: vpextrd $1, %xmm0, (%r11)
269 ; AVX512DQ-BW-FCP-NEXT: vpextrd $2, %xmm0, (%r10)
270 ; AVX512DQ-BW-FCP-NEXT: vpextrd $3, %xmm0, (%rax)
271 ; AVX512DQ-BW-FCP-NEXT: retq
272 %wide.vec = load <16 x i16>, ptr %in.vec, align 64
273 %strided.vec0 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 0, i32 8>
274 %strided.vec1 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 1, i32 9>
275 %strided.vec2 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 2, i32 10>
276 %strided.vec3 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 3, i32 11>
277 %strided.vec4 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 4, i32 12>
278 %strided.vec5 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 5, i32 13>
279 %strided.vec6 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 6, i32 14>
280 %strided.vec7 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <2 x i32> <i32 7, i32 15>
281 store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
282 store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
283 store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
284 store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
285 store <2 x i16> %strided.vec4, ptr %out.vec4, align 64
286 store <2 x i16> %strided.vec5, ptr %out.vec5, align 64
287 store <2 x i16> %strided.vec6, ptr %out.vec6, align 64
288 store <2 x i16> %strided.vec7, ptr %out.vec7, align 64
289 ret void
290 }
292 define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
293 ; SSE-LABEL: load_i16_stride8_vf4:
294 ; SSE: # %bb.0:
295 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
296 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
297 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
298 ; SSE-NEXT: movdqa (%rdi), %xmm0
299 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
300 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
301 ; SSE-NEXT: movdqa 48(%rdi), %xmm3
302 ; SSE-NEXT: movdqa %xmm2, %xmm4
303 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
304 ; SSE-NEXT: movdqa %xmm0, %xmm5
305 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
306 ; SSE-NEXT: movdqa %xmm5, %xmm6
307 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
308 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[1,1,1,1]
309 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm5[1,1,1,1]
310 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
311 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm5[3,3,3,3]
312 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
313 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,3,3,3]
314 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
315 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
316 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
317 ; SSE-NEXT: movdqa %xmm0, %xmm1
318 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
319 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
320 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,1,1]
321 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
322 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
323 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
324 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
325 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
326 ; SSE-NEXT: movq %xmm6, (%rsi)
327 ; SSE-NEXT: movq %xmm8, (%rdx)
328 ; SSE-NEXT: movq %xmm5, (%rcx)
329 ; SSE-NEXT: movq %xmm7, (%r8)
330 ; SSE-NEXT: movq %xmm1, (%r9)
331 ; SSE-NEXT: movq %xmm4, (%r11)
332 ; SSE-NEXT: movq %xmm0, (%r10)
333 ; SSE-NEXT: movq %xmm3, (%rax)
334 ; SSE-NEXT: retq
335 ;
336 ; AVX-LABEL: load_i16_stride8_vf4:
337 ; AVX: # %bb.0:
338 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
339 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
340 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
341 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
342 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
343 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
344 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm3
345 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
346 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
347 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
348 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
349 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm4[2,3],xmm7[4,5,6,7]
350 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
351 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
352 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
353 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7]
354 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
355 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
356 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
357 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
358 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
359 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
360 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
361 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
362 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5,6,7]
363 ; AVX-NEXT: vmovq %xmm6, (%rsi)
364 ; AVX-NEXT: vmovq %xmm7, (%rdx)
365 ; AVX-NEXT: vmovq %xmm8, (%rcx)
366 ; AVX-NEXT: vmovq %xmm4, (%r8)
367 ; AVX-NEXT: vmovq %xmm1, (%r9)
368 ; AVX-NEXT: vmovq %xmm3, (%r11)
369 ; AVX-NEXT: vmovq %xmm5, (%r10)
370 ; AVX-NEXT: vmovq %xmm0, (%rax)
371 ; AVX-NEXT: retq
372 ;
373 ; AVX2-LABEL: load_i16_stride8_vf4:
374 ; AVX2: # %bb.0:
375 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
376 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
377 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
378 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
379 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
380 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm2
381 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm3
382 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
383 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
384 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
385 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
386 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
387 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
388 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
389 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
390 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
391 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
392 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
393 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
394 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
395 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
396 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
397 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
398 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
399 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
400 ; AVX2-NEXT: vmovq %xmm6, (%rsi)
401 ; AVX2-NEXT: vmovq %xmm7, (%rdx)
402 ; AVX2-NEXT: vmovq %xmm8, (%rcx)
403 ; AVX2-NEXT: vmovq %xmm4, (%r8)
404 ; AVX2-NEXT: vmovq %xmm1, (%r9)
405 ; AVX2-NEXT: vmovq %xmm3, (%r11)
406 ; AVX2-NEXT: vmovq %xmm5, (%r10)
407 ; AVX2-NEXT: vmovq %xmm0, (%rax)
408 ; AVX2-NEXT: retq
409 ;
410 ; AVX2-FP-LABEL: load_i16_stride8_vf4:
411 ; AVX2-FP: # %bb.0:
412 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
413 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
414 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
415 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
416 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
417 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm2
418 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3
419 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
420 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
421 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
422 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
423 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
424 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
425 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
426 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
427 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
428 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
429 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
430 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
431 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
432 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
433 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
434 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
435 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
436 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
437 ; AVX2-FP-NEXT: vmovq %xmm6, (%rsi)
438 ; AVX2-FP-NEXT: vmovq %xmm7, (%rdx)
439 ; AVX2-FP-NEXT: vmovq %xmm8, (%rcx)
440 ; AVX2-FP-NEXT: vmovq %xmm4, (%r8)
441 ; AVX2-FP-NEXT: vmovq %xmm1, (%r9)
442 ; AVX2-FP-NEXT: vmovq %xmm3, (%r11)
443 ; AVX2-FP-NEXT: vmovq %xmm5, (%r10)
444 ; AVX2-FP-NEXT: vmovq %xmm0, (%rax)
445 ; AVX2-FP-NEXT: retq
446 ;
447 ; AVX2-FCP-LABEL: load_i16_stride8_vf4:
448 ; AVX2-FCP: # %bb.0:
449 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
450 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
451 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
452 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
453 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
454 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
455 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
456 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
457 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
458 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
459 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
460 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
461 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
462 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
463 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
464 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
465 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
466 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
467 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
468 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
469 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
470 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
471 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
472 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
473 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
474 ; AVX2-FCP-NEXT: vmovq %xmm6, (%rsi)
475 ; AVX2-FCP-NEXT: vmovq %xmm7, (%rdx)
476 ; AVX2-FCP-NEXT: vmovq %xmm8, (%rcx)
477 ; AVX2-FCP-NEXT: vmovq %xmm4, (%r8)
478 ; AVX2-FCP-NEXT: vmovq %xmm1, (%r9)
479 ; AVX2-FCP-NEXT: vmovq %xmm3, (%r11)
480 ; AVX2-FCP-NEXT: vmovq %xmm5, (%r10)
481 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rax)
482 ; AVX2-FCP-NEXT: retq
483 ;
484 ; AVX512-LABEL: load_i16_stride8_vf4:
485 ; AVX512: # %bb.0:
486 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
487 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
488 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
489 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
490 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
491 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2
492 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm3
493 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
494 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
495 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
496 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
497 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
498 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
499 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
500 ; AVX512-NEXT: vpermt2d %xmm4, %xmm9, %xmm5
501 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
502 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
503 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
504 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
505 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
506 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
507 ; AVX512-NEXT: vpermt2d %xmm2, %xmm9, %xmm0
508 ; AVX512-NEXT: vmovq %xmm6, (%rsi)
509 ; AVX512-NEXT: vmovq %xmm7, (%rdx)
510 ; AVX512-NEXT: vmovq %xmm8, (%rcx)
511 ; AVX512-NEXT: vmovq %xmm5, (%r8)
512 ; AVX512-NEXT: vmovq %xmm1, (%r9)
513 ; AVX512-NEXT: vmovq %xmm3, (%r11)
514 ; AVX512-NEXT: vmovq %xmm4, (%r10)
515 ; AVX512-NEXT: vmovq %xmm0, (%rax)
516 ; AVX512-NEXT: retq
517 ;
518 ; AVX512-FCP-LABEL: load_i16_stride8_vf4:
519 ; AVX512-FCP: # %bb.0:
520 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
521 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
522 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
523 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
524 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
525 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
526 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
527 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
528 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
529 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
530 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
531 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm8
532 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8
533 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
534 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
535 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5
536 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
537 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
538 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
539 ; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7
540 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
541 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0
542 ; AVX512-FCP-NEXT: vmovq %xmm6, (%rsi)
543 ; AVX512-FCP-NEXT: vmovq %xmm8, (%rdx)
544 ; AVX512-FCP-NEXT: vmovq %xmm9, (%rcx)
545 ; AVX512-FCP-NEXT: vmovq %xmm5, (%r8)
546 ; AVX512-FCP-NEXT: vmovq %xmm1, (%r9)
547 ; AVX512-FCP-NEXT: vmovq %xmm7, (%r11)
548 ; AVX512-FCP-NEXT: vmovq %xmm3, (%r10)
549 ; AVX512-FCP-NEXT: vmovq %xmm0, (%rax)
550 ; AVX512-FCP-NEXT: retq
551 ;
552 ; AVX512DQ-LABEL: load_i16_stride8_vf4:
553 ; AVX512DQ: # %bb.0:
554 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
555 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10
556 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r11
557 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
558 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
559 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2
560 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm3
561 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
562 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
563 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
564 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[1,1,1,1]
565 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm7[0],xmm4[1],xmm7[2,3]
566 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
567 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm9 = [3,7,3,3]
568 ; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm9, %xmm5
569 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
570 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
571 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
572 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
573 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm2[1],xmm3[2,3]
574 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
575 ; AVX512DQ-NEXT: vpermt2d %xmm2, %xmm9, %xmm0
576 ; AVX512DQ-NEXT: vmovq %xmm6, (%rsi)
577 ; AVX512DQ-NEXT: vmovq %xmm7, (%rdx)
578 ; AVX512DQ-NEXT: vmovq %xmm8, (%rcx)
579 ; AVX512DQ-NEXT: vmovq %xmm5, (%r8)
580 ; AVX512DQ-NEXT: vmovq %xmm1, (%r9)
581 ; AVX512DQ-NEXT: vmovq %xmm3, (%r11)
582 ; AVX512DQ-NEXT: vmovq %xmm4, (%r10)
583 ; AVX512DQ-NEXT: vmovq %xmm0, (%rax)
584 ; AVX512DQ-NEXT: retq
585 ;
586 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf4:
587 ; AVX512DQ-FCP: # %bb.0:
588 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
589 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
590 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
591 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
592 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
593 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
594 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
595 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
596 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
597 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
598 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [1,5,1,1]
599 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm8
600 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm8
601 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
602 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,3,3]
603 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm10, %xmm5
604 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
605 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
606 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
607 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm0, %xmm7
608 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
609 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm10, %xmm0
610 ; AVX512DQ-FCP-NEXT: vmovq %xmm6, (%rsi)
611 ; AVX512DQ-FCP-NEXT: vmovq %xmm8, (%rdx)
612 ; AVX512DQ-FCP-NEXT: vmovq %xmm9, (%rcx)
613 ; AVX512DQ-FCP-NEXT: vmovq %xmm5, (%r8)
614 ; AVX512DQ-FCP-NEXT: vmovq %xmm1, (%r9)
615 ; AVX512DQ-FCP-NEXT: vmovq %xmm7, (%r11)
616 ; AVX512DQ-FCP-NEXT: vmovq %xmm3, (%r10)
617 ; AVX512DQ-FCP-NEXT: vmovq %xmm0, (%rax)
618 ; AVX512DQ-FCP-NEXT: retq
619 ;
620 ; AVX512BW-LABEL: load_i16_stride8_vf4:
621 ; AVX512BW: # %bb.0:
622 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
623 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
624 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
625 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
626 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
627 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
628 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
629 ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
630 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
631 ; AVX512BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
632 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
633 ; AVX512BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
634 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
635 ; AVX512BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
636 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
637 ; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
638 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
639 ; AVX512BW-NEXT: vpermw %zmm1, %zmm7, %zmm7
640 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
641 ; AVX512BW-NEXT: vpermw %zmm1, %zmm8, %zmm1
642 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi)
643 ; AVX512BW-NEXT: vmovq %xmm2, (%rdx)
644 ; AVX512BW-NEXT: vmovq %xmm3, (%rcx)
645 ; AVX512BW-NEXT: vmovq %xmm4, (%r8)
646 ; AVX512BW-NEXT: vmovq %xmm5, (%r9)
647 ; AVX512BW-NEXT: vmovq %xmm6, (%r11)
648 ; AVX512BW-NEXT: vmovq %xmm7, (%r10)
649 ; AVX512BW-NEXT: vmovq %xmm1, (%rax)
650 ; AVX512BW-NEXT: vzeroupper
651 ; AVX512BW-NEXT: retq
652 ;
653 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf4:
654 ; AVX512BW-FCP: # %bb.0:
655 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
656 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
657 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
658 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
659 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
660 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
661 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
662 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
663 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
664 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
665 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
666 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
667 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
668 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
669 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
670 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
671 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
672 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7
673 ; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
674 ; AVX512BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1
675 ; AVX512BW-FCP-NEXT: vmovq %xmm0, (%rsi)
676 ; AVX512BW-FCP-NEXT: vmovq %xmm2, (%rdx)
677 ; AVX512BW-FCP-NEXT: vmovq %xmm3, (%rcx)
678 ; AVX512BW-FCP-NEXT: vmovq %xmm4, (%r8)
679 ; AVX512BW-FCP-NEXT: vmovq %xmm5, (%r9)
680 ; AVX512BW-FCP-NEXT: vmovq %xmm6, (%r11)
681 ; AVX512BW-FCP-NEXT: vmovq %xmm7, (%r10)
682 ; AVX512BW-FCP-NEXT: vmovq %xmm1, (%rax)
683 ; AVX512BW-FCP-NEXT: vzeroupper
684 ; AVX512BW-FCP-NEXT: retq
685 ;
686 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf4:
687 ; AVX512DQ-BW: # %bb.0:
688 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
689 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
690 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
691 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
692 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
693 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm0, %zmm0
694 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
695 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm2, %zmm2
696 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
697 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm3, %zmm3
698 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
699 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm4, %zmm4
700 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
701 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm5, %zmm5
702 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
703 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm6
704 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
705 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm7, %zmm7
706 ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
707 ; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm8, %zmm1
708 ; AVX512DQ-BW-NEXT: vmovq %xmm0, (%rsi)
709 ; AVX512DQ-BW-NEXT: vmovq %xmm2, (%rdx)
710 ; AVX512DQ-BW-NEXT: vmovq %xmm3, (%rcx)
711 ; AVX512DQ-BW-NEXT: vmovq %xmm4, (%r8)
712 ; AVX512DQ-BW-NEXT: vmovq %xmm5, (%r9)
713 ; AVX512DQ-BW-NEXT: vmovq %xmm6, (%r11)
714 ; AVX512DQ-BW-NEXT: vmovq %xmm7, (%r10)
715 ; AVX512DQ-BW-NEXT: vmovq %xmm1, (%rax)
716 ; AVX512DQ-BW-NEXT: vzeroupper
717 ; AVX512DQ-BW-NEXT: retq
718 ;
719 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf4:
720 ; AVX512DQ-BW-FCP: # %bb.0:
721 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
722 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
723 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
724 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
725 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
726 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm0, %zmm0
727 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = [1,9,17,25,0,0,0,0]
728 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm2, %zmm2
729 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm3 = [2,10,18,26,0,0,0,0]
730 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm3, %zmm3
731 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm4 = [3,11,19,27,0,0,0,0]
732 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm4, %zmm4
733 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm5 = [4,12,20,28,0,0,0,0]
734 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm5, %zmm5
735 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm6 = [5,13,21,29,0,0,0,0]
736 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm6, %zmm6
737 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm7 = [6,14,22,30,0,0,0,0]
738 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm7, %zmm7
739 ; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm8 = [7,15,23,31,0,0,0,0]
740 ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm1, %zmm8, %zmm1
741 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm0, (%rsi)
742 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm2, (%rdx)
743 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm3, (%rcx)
744 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm4, (%r8)
745 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm5, (%r9)
746 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm6, (%r11)
747 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm7, (%r10)
748 ; AVX512DQ-BW-FCP-NEXT: vmovq %xmm1, (%rax)
749 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
750 ; AVX512DQ-BW-FCP-NEXT: retq
751 %wide.vec = load <32 x i16>, ptr %in.vec, align 64
752 %strided.vec0 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
753 %strided.vec1 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
754 %strided.vec2 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
755 %strided.vec3 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
756 %strided.vec4 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
757 %strided.vec5 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
758 %strided.vec6 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
759 %strided.vec7 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
760 store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
761 store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
762 store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
763 store <4 x i16> %strided.vec3, ptr %out.vec3, align 64
764 store <4 x i16> %strided.vec4, ptr %out.vec4, align 64
765 store <4 x i16> %strided.vec5, ptr %out.vec5, align 64
766 store <4 x i16> %strided.vec6, ptr %out.vec6, align 64
767 store <4 x i16> %strided.vec7, ptr %out.vec7, align 64
768 ret void
769 }
771 define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
772 ; SSE-LABEL: load_i16_stride8_vf8:
773 ; SSE: # %bb.0:
774 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
775 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
776 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
777 ; SSE-NEXT: movdqa (%rdi), %xmm0
778 ; SSE-NEXT: movdqa 16(%rdi), %xmm7
779 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
780 ; SSE-NEXT: movdqa 48(%rdi), %xmm9
781 ; SSE-NEXT: movdqa 80(%rdi), %xmm10
782 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
783 ; SSE-NEXT: movdqa 112(%rdi), %xmm11
784 ; SSE-NEXT: movdqa 96(%rdi), %xmm3
785 ; SSE-NEXT: movdqa %xmm3, %xmm13
786 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3]
787 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[0,0,0,0]
788 ; SSE-NEXT: movdqa %xmm1, %xmm12
789 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3]
790 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,0,0,0]
791 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm4[2],xmm6[3],xmm4[3]
792 ; SSE-NEXT: movdqa %xmm2, %xmm14
793 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3]
794 ; SSE-NEXT: movdqa %xmm0, %xmm4
795 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3]
796 ; SSE-NEXT: movdqa %xmm4, %xmm5
797 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1]
798 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
799 ; SSE-NEXT: movdqa %xmm12, %xmm8
800 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1]
801 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[1,1,1,1]
802 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[1,1,1,1]
803 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1]
804 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3]
805 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm13[2,2,2,2]
806 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm12[2,2,2,2]
807 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm8[2],xmm15[3],xmm8[3]
808 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[3,3,3,3]
809 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm14[2],xmm4[3],xmm14[3]
810 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm15[2,3]
811 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
812 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm14[3,3,3,3]
813 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm13[0],xmm8[1],xmm13[1]
814 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm12[2,3]
815 ; SSE-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
816 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm3[0,0,0,0]
817 ; SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm10[4],xmm1[5],xmm10[5],xmm1[6],xmm10[6],xmm1[7],xmm10[7]
818 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,0,0,0]
819 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3]
820 ; SSE-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7]
821 ; SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
822 ; SSE-NEXT: movdqa %xmm0, %xmm7
823 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
824 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm10[2,3]
825 ; SSE-NEXT: movdqa %xmm1, %xmm9
826 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
827 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[1,1,1,1]
828 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[1,1,1,1]
829 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
830 ; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,1],xmm9[2,3]
831 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[2,2,2,2]
832 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[2,2,2,2]
833 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
834 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[3,3,3,3]
835 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
836 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3]
837 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
838 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
839 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1]
840 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm1[2,3]
841 ; SSE-NEXT: movaps %xmm5, (%rsi)
842 ; SSE-NEXT: movaps %xmm6, (%rdx)
843 ; SSE-NEXT: movaps %xmm4, (%rcx)
844 ; SSE-NEXT: movaps %xmm8, (%r8)
845 ; SSE-NEXT: movaps %xmm7, (%r9)
846 ; SSE-NEXT: movaps %xmm11, (%r11)
847 ; SSE-NEXT: movaps %xmm0, (%r10)
848 ; SSE-NEXT: movaps %xmm9, (%rax)
849 ; SSE-NEXT: retq
850 ;
851 ; AVX-LABEL: load_i16_stride8_vf8:
852 ; AVX: # %bb.0:
853 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
854 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
855 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r11
856 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm2
857 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm3
858 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
859 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0]
860 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm5
861 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm6
862 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
863 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
864 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
865 ; AVX-NEXT: vmovdqa (%rdi), %xmm8
866 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm9
867 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm10
868 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm11
869 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
870 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
871 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
872 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
873 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
874 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1]
875 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm12[2,3],xmm14[4,5,6,7]
876 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3],xmm1[4,5,6,7]
877 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,2,2]
878 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0,1,2,3,4,5],xmm14[6,7]
879 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
880 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4,5,6,7]
881 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
882 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,3,2,3]
883 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3]
884 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3],xmm12[4,5,6,7]
885 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4,5,6,7]
886 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
887 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,0,0,0]
888 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
889 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm5[0,1,0,1]
890 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0,1,2,3,4,5],xmm3[6,7]
891 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
892 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
893 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
894 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3],xmm3[4,5,6,7]
895 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
896 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
897 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm6[2,3],xmm9[4,5,6,7]
898 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2,3],xmm8[4,5,6,7]
899 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
900 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm5[0,1,2,3,4,5],xmm9[6,7]
901 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
902 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7]
903 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
904 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
905 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
906 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
907 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
908 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
909 ; AVX-NEXT: vmovdqa %xmm1, (%rdx)
910 ; AVX-NEXT: vmovdqa %xmm14, (%rcx)
911 ; AVX-NEXT: vmovdqa %xmm4, (%r8)
912 ; AVX-NEXT: vmovdqa %xmm3, (%r9)
913 ; AVX-NEXT: vmovdqa %xmm8, (%r11)
914 ; AVX-NEXT: vmovdqa %xmm9, (%r10)
915 ; AVX-NEXT: vmovdqa %xmm2, (%rax)
916 ; AVX-NEXT: retq
917 ;
918 ; AVX2-LABEL: load_i16_stride8_vf8:
919 ; AVX2: # %bb.0:
920 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
921 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
922 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
923 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm2
924 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm3
925 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
926 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm0
927 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm5
928 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm6
929 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
930 ; AVX2-NEXT: vpbroadcastd %xmm7, %xmm1
931 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
932 ; AVX2-NEXT: vmovdqa (%rdi), %xmm8
933 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm9
934 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm10
935 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm11
936 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
937 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
938 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
939 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
940 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
941 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1]
942 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3]
943 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
944 ; AVX2-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,2,2]
945 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0,1,2],xmm14[3]
946 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
947 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
948 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
949 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,3,2,3]
950 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3]
951 ; AVX2-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3]
952 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
953 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
954 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm3
955 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
956 ; AVX2-NEXT: vpbroadcastd %xmm5, %xmm6
957 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
958 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
959 ; AVX2-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
960 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
961 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
962 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
963 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
964 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2,3]
965 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
966 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
967 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3]
968 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
969 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
970 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
971 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
972 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
973 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
974 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
975 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
976 ; AVX2-NEXT: vmovdqa %xmm1, (%rdx)
977 ; AVX2-NEXT: vmovdqa %xmm14, (%rcx)
978 ; AVX2-NEXT: vmovdqa %xmm4, (%r8)
979 ; AVX2-NEXT: vmovdqa %xmm3, (%r9)
980 ; AVX2-NEXT: vmovdqa %xmm8, (%r11)
981 ; AVX2-NEXT: vmovdqa %xmm9, (%r10)
982 ; AVX2-NEXT: vmovdqa %xmm2, (%rax)
983 ; AVX2-NEXT: retq
984 ;
985 ; AVX2-FP-LABEL: load_i16_stride8_vf8:
986 ; AVX2-FP: # %bb.0:
987 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
988 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r10
989 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %r11
990 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm2
991 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm3
992 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
993 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm0
994 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm5
995 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm6
996 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
997 ; AVX2-FP-NEXT: vpbroadcastd %xmm7, %xmm1
998 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
999 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm8
1000 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm9
1001 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm10
1002 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm11
1003 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1004 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
1005 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
1006 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1007 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
1008 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1]
1009 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3]
1010 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
1011 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,2,2]
1012 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0,1,2],xmm14[3]
1013 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
1014 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
1015 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
1016 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,3,2,3]
1017 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3]
1018 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3]
1019 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
1020 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1021 ; AVX2-FP-NEXT: vpbroadcastd %xmm2, %xmm3
1022 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1023 ; AVX2-FP-NEXT: vpbroadcastd %xmm5, %xmm6
1024 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1025 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
1026 ; AVX2-FP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
1027 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1028 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
1029 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1030 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
1031 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2,3]
1032 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
1033 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
1034 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3]
1035 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1036 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1037 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1038 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
1039 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
1040 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
1041 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
1042 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsi)
1043 ; AVX2-FP-NEXT: vmovdqa %xmm1, (%rdx)
1044 ; AVX2-FP-NEXT: vmovdqa %xmm14, (%rcx)
1045 ; AVX2-FP-NEXT: vmovdqa %xmm4, (%r8)
1046 ; AVX2-FP-NEXT: vmovdqa %xmm3, (%r9)
1047 ; AVX2-FP-NEXT: vmovdqa %xmm8, (%r11)
1048 ; AVX2-FP-NEXT: vmovdqa %xmm9, (%r10)
1049 ; AVX2-FP-NEXT: vmovdqa %xmm2, (%rax)
1050 ; AVX2-FP-NEXT: retq
1051 ;
1052 ; AVX2-FCP-LABEL: load_i16_stride8_vf8:
1053 ; AVX2-FCP: # %bb.0:
1054 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1055 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1056 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1057 ; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm2
1058 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm3
1059 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1060 ; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm0
1061 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm5
1062 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
1063 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1064 ; AVX2-FCP-NEXT: vpbroadcastd %xmm7, %xmm1
1065 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1066 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm8
1067 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm9
1068 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm10
1069 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm11
1070 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
1071 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3]
1072 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
1073 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1074 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
1075 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm13[1,1,1,1]
1076 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0],xmm12[1],xmm14[2,3]
1077 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
1078 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm4[2,2,2,2]
1079 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm7[0,1,2],xmm14[3]
1080 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
1081 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm15[0,1],xmm14[2,3]
1082 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm7[2],xmm4[2],xmm7[3],xmm4[3]
1083 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[2,3,2,3]
1084 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm13[3,3,3,3]
1085 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0],xmm7[1],xmm12[2,3]
1086 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
1087 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
1088 ; AVX2-FCP-NEXT: vpbroadcastd %xmm2, %xmm3
1089 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1090 ; AVX2-FCP-NEXT: vpbroadcastd %xmm5, %xmm6
1091 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1092 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
1093 ; AVX2-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7]
1094 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1095 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
1096 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1097 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
1098 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm6[1],xmm9[2,3]
1099 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
1100 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
1101 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm5[0,1,2],xmm9[3]
1102 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1103 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1104 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm5[2],xmm2[2],xmm5[3],xmm2[3]
1105 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[2,3,2,3]
1106 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
1107 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
1108 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
1109 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsi)
1110 ; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rdx)
1111 ; AVX2-FCP-NEXT: vmovdqa %xmm14, (%rcx)
1112 ; AVX2-FCP-NEXT: vmovdqa %xmm4, (%r8)
1113 ; AVX2-FCP-NEXT: vmovdqa %xmm3, (%r9)
1114 ; AVX2-FCP-NEXT: vmovdqa %xmm8, (%r11)
1115 ; AVX2-FCP-NEXT: vmovdqa %xmm9, (%r10)
1116 ; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rax)
1117 ; AVX2-FCP-NEXT: retq
1118 ;
1119 ; AVX512-LABEL: load_i16_stride8_vf8:
1120 ; AVX512: # %bb.0:
1121 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1
1122 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2
1123 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1124 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3
1125 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm4
1126 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1127 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
1128 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1129 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
1130 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
1131 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1132 ; AVX512-NEXT: vpermt2d %xmm5, %xmm0, %xmm6
1133 ; AVX512-NEXT: vmovdqa (%rdi), %xmm5
1134 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm10
1135 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm11
1136 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm12
1137 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
1138 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
1139 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
1140 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3]
1141 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,1,1]
1142 ; AVX512-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3]
1143 ; AVX512-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
1144 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
1145 ; AVX512-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
1146 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0]
1147 ; AVX512-NEXT: vpermt2d %xmm13, %xmm15, %xmm14
1148 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
1149 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1150 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1151 ; AVX512-NEXT: vpermi2d %xmm1, %xmm2, %xmm0
1152 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
1153 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
1154 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1155 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
1156 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1157 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
1158 ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3]
1159 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
1160 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2]
1161 ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3]
1162 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1163 ; AVX512-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
1164 ; AVX512-NEXT: vpermt2d %xmm3, %xmm15, %xmm4
1165 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1166 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
1167 ; AVX512-NEXT: vmovdqa %xmm6, (%rsi)
1168 ; AVX512-NEXT: vmovdqa %xmm7, (%rdx)
1169 ; AVX512-NEXT: vmovdqa %xmm8, (%rcx)
1170 ; AVX512-NEXT: vmovdqa %xmm9, (%r8)
1171 ; AVX512-NEXT: vmovdqa %xmm0, (%r9)
1172 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1173 ; AVX512-NEXT: vmovdqa %xmm5, (%rax)
1174 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1175 ; AVX512-NEXT: vmovdqa %xmm10, (%rax)
1176 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
1177 ; AVX512-NEXT: vmovdqa %xmm1, (%rax)
1178 ; AVX512-NEXT: retq
1179 ;
1180 ; AVX512-FCP-LABEL: load_i16_stride8_vf8:
1181 ; AVX512-FCP: # %bb.0:
1182 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
1183 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
1184 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1185 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm17
1186 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm5
1187 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
1188 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1189 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4]
1190 ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm0
1191 ; AVX512-FCP-NEXT: vpermt2d %xmm10, %xmm3, %xmm0
1192 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm7
1193 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
1194 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
1195 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm13
1196 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1197 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
1198 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
1199 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1200 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
1201 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0]
1202 ; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm2
1203 ; AVX512-FCP-NEXT: vpermt2d %xmm14, %xmm9, %xmm2
1204 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
1205 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
1206 ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm0
1207 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1208 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6]
1209 ; AVX512-FCP-NEXT: vpermt2d %xmm10, %xmm1, %xmm0
1210 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
1211 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
1212 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0]
1213 ; AVX512-FCP-NEXT: vpermt2d %xmm14, %xmm10, %xmm15
1214 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
1215 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm14
1216 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
1217 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1218 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
1219 ; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm3
1220 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
1221 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1222 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
1223 ; AVX512-FCP-NEXT: vpermi2d %xmm6, %xmm7, %xmm9
1224 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
1225 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
1226 ; AVX512-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm1
1227 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1228 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
1229 ; AVX512-FCP-NEXT: vpermt2d %xmm6, %xmm10, %xmm7
1230 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1231 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
1232 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, (%rsi)
1233 ; AVX512-FCP-NEXT: vmovdqa %xmm2, (%rdx)
1234 ; AVX512-FCP-NEXT: vmovdqa %xmm0, (%rcx)
1235 ; AVX512-FCP-NEXT: vmovdqa %xmm11, (%r8)
1236 ; AVX512-FCP-NEXT: vmovdqa %xmm3, (%r9)
1237 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1238 ; AVX512-FCP-NEXT: vmovdqa %xmm8, (%rax)
1239 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1240 ; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rax)
1241 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1242 ; AVX512-FCP-NEXT: vmovdqa %xmm4, (%rax)
1243 ; AVX512-FCP-NEXT: retq
1244 ;
1245 ; AVX512DQ-LABEL: load_i16_stride8_vf8:
1246 ; AVX512DQ: # %bb.0:
1247 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1
1248 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2
1249 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1250 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3
1251 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm4
1252 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1253 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
1254 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1255 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm5[2,2,2,2]
1256 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm6[0,1,2],xmm8[3]
1257 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1258 ; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm0, %xmm6
1259 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5
1260 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm10
1261 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm11
1262 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm12
1263 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3]
1264 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3]
1265 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
1266 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm6[2,3]
1267 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm14[1,1,1,1]
1268 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0],xmm13[1],xmm15[2,3]
1269 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3]
1270 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
1271 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3]
1272 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm15 = [3,7,0,0]
1273 ; AVX512DQ-NEXT: vpermt2d %xmm13, %xmm15, %xmm14
1274 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3]
1275 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1276 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
1277 ; AVX512DQ-NEXT: vpermi2d %xmm1, %xmm2, %xmm0
1278 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
1279 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
1280 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
1281 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3]
1282 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1283 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
1284 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm10[0],xmm3[1],xmm10[2,3]
1285 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm10[0,1],xmm5[2,3]
1286 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[2,2,2,2]
1287 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm2[0,1,2],xmm10[3]
1288 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
1289 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
1290 ; AVX512DQ-NEXT: vpermt2d %xmm3, %xmm15, %xmm4
1291 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1292 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
1293 ; AVX512DQ-NEXT: vmovdqa %xmm6, (%rsi)
1294 ; AVX512DQ-NEXT: vmovdqa %xmm7, (%rdx)
1295 ; AVX512DQ-NEXT: vmovdqa %xmm8, (%rcx)
1296 ; AVX512DQ-NEXT: vmovdqa %xmm9, (%r8)
1297 ; AVX512DQ-NEXT: vmovdqa %xmm0, (%r9)
1298 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1299 ; AVX512DQ-NEXT: vmovdqa %xmm5, (%rax)
1300 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1301 ; AVX512DQ-NEXT: vmovdqa %xmm10, (%rax)
1302 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
1303 ; AVX512DQ-NEXT: vmovdqa %xmm1, (%rax)
1304 ; AVX512DQ-NEXT: retq
1305 ;
1306 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf8:
1307 ; AVX512DQ-FCP: # %bb.0:
1308 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
1309 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm4
1310 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
1311 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm17
1312 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm5
1313 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm6
1314 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1315 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,0,0,4]
1316 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm0
1317 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm10, %xmm3, %xmm0
1318 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm7
1319 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm8
1320 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12
1321 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm13
1322 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
1323 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3]
1324 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
1325 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
1326 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm16
1327 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm9 = [1,5,0,0]
1328 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm2
1329 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm14, %xmm9, %xmm2
1330 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
1331 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
1332 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm0
1333 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1334 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,2,6]
1335 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm10, %xmm1, %xmm0
1336 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3]
1337 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3]
1338 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm10 = [3,7,0,0]
1339 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm14, %xmm10, %xmm15
1340 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3]
1341 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm14
1342 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
1343 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
1344 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
1345 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm3
1346 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
1347 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1348 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3]
1349 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm6, %xmm7, %xmm9
1350 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
1351 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
1352 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm4, %xmm5, %xmm1
1353 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm9 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1354 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm9[0,1],xmm1[2,3]
1355 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm6, %xmm10, %xmm7
1356 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm5[2],xmm4[2],xmm5[3],xmm4[3]
1357 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm7[0,1],xmm4[2,3]
1358 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, (%rsi)
1359 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, (%rdx)
1360 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, (%rcx)
1361 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, (%r8)
1362 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, (%r9)
1363 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1364 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm8, (%rax)
1365 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1366 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rax)
1367 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1368 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, (%rax)
1369 ; AVX512DQ-FCP-NEXT: retq
1370 ;
1371 ; AVX512BW-LABEL: load_i16_stride8_vf8:
1372 ; AVX512BW: # %bb.0:
1373 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1374 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1375 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
1376 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56]
1377 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
1378 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
1379 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1380 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57]
1381 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1382 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58]
1383 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1384 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59]
1385 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1386 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60]
1387 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6
1388 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61]
1389 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7
1390 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62]
1391 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm8
1392 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63]
1393 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
1394 ; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
1395 ; AVX512BW-NEXT: vmovdqa %xmm3, (%rdx)
1396 ; AVX512BW-NEXT: vmovdqa %xmm4, (%rcx)
1397 ; AVX512BW-NEXT: vmovdqa %xmm5, (%r8)
1398 ; AVX512BW-NEXT: vmovdqa %xmm6, (%r9)
1399 ; AVX512BW-NEXT: vmovdqa %xmm7, (%r11)
1400 ; AVX512BW-NEXT: vmovdqa %xmm8, (%r10)
1401 ; AVX512BW-NEXT: vmovdqa %xmm9, (%rax)
1402 ; AVX512BW-NEXT: vzeroupper
1403 ; AVX512BW-NEXT: retq
1404 ;
1405 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf8:
1406 ; AVX512BW-FCP: # %bb.0:
1407 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1408 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1409 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1410 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56]
1411 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
1412 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
1413 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1414 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57]
1415 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1416 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58]
1417 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1418 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59]
1419 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1420 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60]
1421 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6
1422 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61]
1423 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7
1424 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62]
1425 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm8
1426 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63]
1427 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
1428 ; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
1429 ; AVX512BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
1430 ; AVX512BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
1431 ; AVX512BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
1432 ; AVX512BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
1433 ; AVX512BW-FCP-NEXT: vmovdqa %xmm7, (%r11)
1434 ; AVX512BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
1435 ; AVX512BW-FCP-NEXT: vmovdqa %xmm9, (%rax)
1436 ; AVX512BW-FCP-NEXT: vzeroupper
1437 ; AVX512BW-FCP-NEXT: retq
1438 ;
1439 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf8:
1440 ; AVX512DQ-BW: # %bb.0:
1441 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
1442 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
1443 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
1444 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56]
1445 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
1446 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
1447 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1448 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57]
1449 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1450 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58]
1451 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1452 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59]
1453 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1454 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60]
1455 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6
1456 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61]
1457 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7
1458 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62]
1459 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm8
1460 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63]
1461 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
1462 ; AVX512DQ-BW-NEXT: vmovdqa %xmm0, (%rsi)
1463 ; AVX512DQ-BW-NEXT: vmovdqa %xmm3, (%rdx)
1464 ; AVX512DQ-BW-NEXT: vmovdqa %xmm4, (%rcx)
1465 ; AVX512DQ-BW-NEXT: vmovdqa %xmm5, (%r8)
1466 ; AVX512DQ-BW-NEXT: vmovdqa %xmm6, (%r9)
1467 ; AVX512DQ-BW-NEXT: vmovdqa %xmm7, (%r11)
1468 ; AVX512DQ-BW-NEXT: vmovdqa %xmm8, (%r10)
1469 ; AVX512DQ-BW-NEXT: vmovdqa %xmm9, (%rax)
1470 ; AVX512DQ-BW-NEXT: vzeroupper
1471 ; AVX512DQ-BW-NEXT: retq
1472 ;
1473 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf8:
1474 ; AVX512DQ-BW-FCP: # %bb.0:
1475 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
1476 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
1477 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
1478 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm0 = [0,8,16,24,32,40,48,56]
1479 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
1480 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
1481 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1482 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm3 = [1,9,17,25,33,41,49,57]
1483 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1484 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm4 = [2,10,18,26,34,42,50,58]
1485 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1486 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [3,11,19,27,35,43,51,59]
1487 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1488 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [4,12,20,28,36,44,52,60]
1489 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm6
1490 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [5,13,21,29,37,45,53,61]
1491 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm7
1492 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [6,14,22,30,38,46,54,62]
1493 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm8
1494 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [7,15,23,31,39,47,55,63]
1495 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
1496 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%rsi)
1497 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm3, (%rdx)
1498 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm4, (%rcx)
1499 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm5, (%r8)
1500 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm6, (%r9)
1501 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm7, (%r11)
1502 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm8, (%r10)
1503 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm9, (%rax)
1504 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1505 ; AVX512DQ-BW-FCP-NEXT: retq
1506 %wide.vec = load <64 x i16>, ptr %in.vec, align 64
1507 %strided.vec0 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
1508 %strided.vec1 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57>
1509 %strided.vec2 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58>
1510 %strided.vec3 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59>
1511 %strided.vec4 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60>
1512 %strided.vec5 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61>
1513 %strided.vec6 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62>
1514 %strided.vec7 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <8 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63>
1515 store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
1516 store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
1517 store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
1518 store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
1519 store <8 x i16> %strided.vec4, ptr %out.vec4, align 64
1520 store <8 x i16> %strided.vec5, ptr %out.vec5, align 64
1521 store <8 x i16> %strided.vec6, ptr %out.vec6, align 64
1522 store <8 x i16> %strided.vec7, ptr %out.vec7, align 64
1523 ret void
1524 }
1526 define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
1527 ; SSE-LABEL: load_i16_stride8_vf16:
1528 ; SSE: # %bb.0:
1529 ; SSE-NEXT: subq $168, %rsp
1530 ; SSE-NEXT: movdqa 112(%rdi), %xmm6
1531 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1532 ; SSE-NEXT: movdqa 96(%rdi), %xmm14
1533 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1534 ; SSE-NEXT: movdqa 208(%rdi), %xmm0
1535 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
1536 ; SSE-NEXT: movdqa 192(%rdi), %xmm5
1537 ; SSE-NEXT: movdqa 240(%rdi), %xmm1
1538 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1539 ; SSE-NEXT: movdqa 224(%rdi), %xmm12
1540 ; SSE-NEXT: movdqa 144(%rdi), %xmm3
1541 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1542 ; SSE-NEXT: movdqa 128(%rdi), %xmm10
1543 ; SSE-NEXT: movdqa 176(%rdi), %xmm2
1544 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1545 ; SSE-NEXT: movdqa 160(%rdi), %xmm11
1546 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1547 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3]
1548 ; SSE-NEXT: movdqa %xmm10, %xmm2
1549 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
1550 ; SSE-NEXT: movdqa %xmm12, %xmm4
1551 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1552 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0]
1553 ; SSE-NEXT: movdqa %xmm5, %xmm1
1554 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1555 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
1556 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
1557 ; SSE-NEXT: movdqa %xmm2, %xmm3
1558 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm11[0],xmm3[1],xmm11[1]
1559 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
1560 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1561 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
1562 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
1563 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1564 ; SSE-NEXT: movdqa 64(%rdi), %xmm3
1565 ; SSE-NEXT: movdqa %xmm3, %xmm7
1566 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
1567 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,0,0,0]
1568 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm7[0,0,0,0]
1569 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
1570 ; SSE-NEXT: movdqa 32(%rdi), %xmm9
1571 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1572 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
1573 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1574 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
1575 ; SSE-NEXT: movdqa (%rdi), %xmm6
1576 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
1577 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1578 ; SSE-NEXT: movdqa %xmm6, %xmm8
1579 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
1580 ; SSE-NEXT: movdqa %xmm8, %xmm0
1581 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
1582 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
1583 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1584 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
1585 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm11[1,1,1,1]
1586 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
1587 ; SSE-NEXT: movdqa %xmm1, %xmm15
1588 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1]
1589 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
1590 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1591 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm9[1,1,1,1]
1592 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
1593 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
1594 ; SSE-NEXT: movdqa %xmm7, %xmm15
1595 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
1596 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm15[2,3]
1597 ; SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1598 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm4[2,2,2,2]
1599 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,2,2,2]
1600 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
1601 ; SSE-NEXT: movdqa %xmm2, %xmm15
1602 ; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm11[2],xmm15[3],xmm11[3]
1603 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm15[0],xmm0[1]
1604 ; SSE-NEXT: movapd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1605 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,2,2,2]
1606 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,2,2,2]
1607 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
1608 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[3,3,3,3]
1609 ; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm9[2],xmm8[3],xmm9[3]
1610 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm0[2,3]
1611 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
1612 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
1613 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3]
1614 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1615 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
1616 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1617 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm14[2],xmm7[3],xmm14[3]
1618 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,3,3,3]
1619 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
1620 ; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,1],xmm7[2,3]
1621 ; SSE-NEXT: movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1622 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
1623 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
1624 ; SSE-NEXT: # xmm13 = xmm13[4],mem[4],xmm13[5],mem[5],xmm13[6],mem[6],xmm13[7],mem[7]
1625 ; SSE-NEXT: movdqa %xmm10, %xmm15
1626 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1627 ; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
1628 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1629 ; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7]
1630 ; SSE-NEXT: movdqa %xmm5, %xmm2
1631 ; SSE-NEXT: punpckhwd (%rsp), %xmm2 # 16-byte Folded Reload
1632 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
1633 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
1634 ; SSE-NEXT: movdqa %xmm12, %xmm0
1635 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm2[0,0,0,0]
1636 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3]
1637 ; SSE-NEXT: movdqa %xmm15, %xmm1
1638 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
1639 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm1[0],xmm12[1]
1640 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1641 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
1642 ; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
1643 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1644 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
1645 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
1646 ; SSE-NEXT: movdqa %xmm10, %xmm14
1647 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1648 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,0,0]
1649 ; SSE-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
1650 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
1651 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
1652 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
1653 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
1654 ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
1655 ; SSE-NEXT: movdqa %xmm6, %xmm1
1656 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
1657 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
1658 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[1,1,1,1]
1659 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm13[1,1,1,1]
1660 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
1661 ; SSE-NEXT: movdqa %xmm2, %xmm7
1662 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
1663 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm7[2,3]
1664 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[1,1,1,1]
1665 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[1,1,1,1]
1666 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
1667 ; SSE-NEXT: movdqa %xmm3, %xmm9
1668 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
1669 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3]
1670 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[2,2,2,2]
1671 ; SSE-NEXT: movdqa %xmm0, %xmm10
1672 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[2,2,2,2]
1673 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
1674 ; SSE-NEXT: movdqa %xmm15, %xmm11
1675 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
1676 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm11[0],xmm9[1]
1677 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm14[2,2,2,2]
1678 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
1679 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
1680 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[3,3,3,3]
1681 ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
1682 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm0[2,3]
1683 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm10[2],xmm2[3],xmm10[3]
1684 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[3,3,3,3]
1685 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[3,3,3,3]
1686 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
1687 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
1688 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1689 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
1690 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,3,3,3]
1691 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1]
1692 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3]
1693 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1694 ; SSE-NEXT: movaps %xmm2, (%rsi)
1695 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1696 ; SSE-NEXT: movaps %xmm3, 16(%rsi)
1697 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1698 ; SSE-NEXT: movaps %xmm2, (%rdx)
1699 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1700 ; SSE-NEXT: movaps %xmm2, 16(%rdx)
1701 ; SSE-NEXT: movaps %xmm8, (%rcx)
1702 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1703 ; SSE-NEXT: movaps %xmm2, 16(%rcx)
1704 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1705 ; SSE-NEXT: movaps %xmm2, (%r8)
1706 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1707 ; SSE-NEXT: movaps %xmm2, 16(%r8)
1708 ; SSE-NEXT: movaps %xmm1, (%r9)
1709 ; SSE-NEXT: movapd %xmm12, 16(%r9)
1710 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1711 ; SSE-NEXT: movaps %xmm7, (%rax)
1712 ; SSE-NEXT: movaps %xmm4, 16(%rax)
1713 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1714 ; SSE-NEXT: movaps %xmm6, (%rax)
1715 ; SSE-NEXT: movapd %xmm9, 16(%rax)
1716 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
1717 ; SSE-NEXT: movaps %xmm0, 16(%rax)
1718 ; SSE-NEXT: movaps %xmm14, (%rax)
1719 ; SSE-NEXT: addq $168, %rsp
1720 ; SSE-NEXT: retq
1721 ;
1722 ; AVX-LABEL: load_i16_stride8_vf16:
1723 ; AVX: # %bb.0:
1724 ; AVX-NEXT: subq $152, %rsp
1725 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm0
1726 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1727 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm1
1728 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1729 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1730 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm0
1731 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1732 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm1
1733 ; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
1734 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1735 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
1736 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1737 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm1
1738 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1739 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm2
1740 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1741 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
1742 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,0,0,0]
1743 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm2
1744 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1745 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm3
1746 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1747 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1748 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
1749 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
1750 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
1751 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1752 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm0
1753 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1754 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm1
1755 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1756 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1757 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[0,0,0,0]
1758 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm1
1759 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1760 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm15
1761 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
1762 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
1763 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm1[0,1,2,3,4,5],xmm0[6,7]
1764 ; AVX-NEXT: vmovdqa (%rdi), %xmm14
1765 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm12
1766 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm11
1767 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm10
1768 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1769 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3]
1770 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1771 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm13[0,1,2,3],xmm6[4,5,6,7]
1772 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm9[4,5,6,7]
1773 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1774 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
1775 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1776 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[1,1,1,1]
1777 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7]
1778 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
1779 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
1780 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,1,1]
1781 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3],xmm9[4,5,6,7]
1782 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
1783 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3],xmm13[4,5,6,7]
1784 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
1785 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1786 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
1787 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1788 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[2,2,2,2]
1789 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm3[0,1,2,3,4,5],xmm9[6,7]
1790 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
1791 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
1792 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm4[2,2,2,2]
1793 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0,1,2,3,4,5],xmm9[6,7]
1794 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1795 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm13[0,1,2,3],xmm9[4,5,6,7]
1796 ; AVX-NEXT: vblendps {{.*#+}} ymm13 = ymm9[0,1,2,3],ymm6[4,5,6,7]
1797 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
1798 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[2,3,2,3]
1799 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
1800 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
1801 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
1802 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
1803 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
1804 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
1805 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1806 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1807 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1808 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1809 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1810 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1811 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1812 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1813 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
1814 ; AVX-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload
1815 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
1816 ; AVX-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
1817 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1818 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
1819 ; AVX-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
1820 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1821 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
1822 ; AVX-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
1823 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,0,0,0]
1824 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,1,0,1]
1825 ; AVX-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5],xmm4[6,7]
1826 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1827 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
1828 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
1829 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm4[6,7]
1830 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1831 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
1832 ; AVX-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
1833 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm9 # 16-byte Folded Reload
1834 ; AVX-NEXT: # xmm9 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
1835 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,0,0,0]
1836 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm9[0,1,0,1]
1837 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm7[0,1,2,3,4,5],xmm6[6,7]
1838 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
1839 ; AVX-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
1840 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1841 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm10[0,1,2,3],xmm8[4,5,6,7]
1842 ; AVX-NEXT: vblendps {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
1843 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
1844 ; AVX-NEXT: vinsertf128 $1, %xmm8, %ymm0, %ymm8
1845 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[1,1,1,1]
1846 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3],xmm10[4,5,6,7]
1847 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
1848 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5],ymm8[6,7]
1849 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
1850 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[1,1,1,1]
1851 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0,1],xmm6[2,3],xmm11[4,5,6,7]
1852 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
1853 ; AVX-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3],ymm8[4,5,6,7]
1854 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1855 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
1856 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[2,2,2,2]
1857 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0,1,2,3,4,5],xmm11[6,7]
1858 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
1859 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
1860 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm4[2,2,2,2]
1861 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm9[0,1,2,3,4,5],xmm11[6,7]
1862 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm7[2],xmm6[2],xmm7[3],xmm6[3]
1863 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
1864 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
1865 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1866 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
1867 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1868 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
1869 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm1
1870 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
1871 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
1872 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm4[2],xmm9[3],xmm4[3]
1873 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[2,3,2,3]
1874 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,3,3,3]
1875 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3],xmm3[4,5,6,7]
1876 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1877 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1878 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1879 ; AVX-NEXT: vmovaps %ymm1, (%rsi)
1880 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1881 ; AVX-NEXT: vmovaps %ymm1, (%rdx)
1882 ; AVX-NEXT: vmovaps %ymm13, (%rcx)
1883 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
1884 ; AVX-NEXT: vmovaps %ymm1, (%r8)
1885 ; AVX-NEXT: vmovaps %ymm5, (%r9)
1886 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1887 ; AVX-NEXT: vmovaps %ymm8, (%rax)
1888 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1889 ; AVX-NEXT: vmovaps %ymm10, (%rax)
1890 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
1891 ; AVX-NEXT: vmovaps %ymm0, (%rax)
1892 ; AVX-NEXT: addq $152, %rsp
1893 ; AVX-NEXT: vzeroupper
1894 ; AVX-NEXT: retq
1895 ;
1896 ; AVX2-LABEL: load_i16_stride8_vf16:
1897 ; AVX2: # %bb.0:
1898 ; AVX2-NEXT: subq $264, %rsp # imm = 0x108
1899 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm2
1900 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1901 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm5
1902 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1903 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm0
1904 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1905 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm1
1906 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1907 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1908 ; AVX2-NEXT: vpbroadcastd %xmm12, %xmm0
1909 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm1
1910 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1911 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm3
1912 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1913 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
1914 ; AVX2-NEXT: vpbroadcastd %xmm9, %xmm1
1915 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1916 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
1917 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1918 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
1919 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1920 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
1921 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1922 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm6
1923 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1924 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
1925 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
1926 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
1927 ; AVX2-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3]
1928 ; AVX2-NEXT: vpermq {{.*#+}} ymm13 = ymm5[0,1,0,2]
1929 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,2,0,4,5,6,4]
1930 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
1931 ; AVX2-NEXT: vpermq {{.*#+}} ymm11 = ymm2[0,1,0,2]
1932 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,0,4,5,6,4]
1933 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
1934 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
1935 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0
1936 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1937 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1
1938 ; AVX2-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
1939 ; AVX2-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2]
1940 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,2,2,3,4,6,6,7]
1941 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
1942 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2]
1943 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,2,2,3,4,6,6,7]
1944 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
1945 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
1946 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
1947 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
1948 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1949 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,1,1]
1950 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
1951 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
1952 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
1953 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
1954 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
1955 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
1956 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
1957 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
1958 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
1959 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
1960 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
1961 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1962 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,2,2,2]
1963 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
1964 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
1965 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
1966 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,5,7]
1967 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,1,3,4,5,5,7]
1968 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
1969 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
1970 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
1971 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,1,2,3,7,5,6,7]
1972 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,2,3,7,5,6,7]
1973 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
1974 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
1975 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
1976 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
1977 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
1978 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1979 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
1980 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,3,2,3]
1981 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
1982 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
1983 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
1984 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
1985 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
1986 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
1987 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
1988 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
1989 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
1990 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
1991 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1992 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1993 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1994 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
1995 ; AVX2-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
1996 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1
1997 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1998 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
1999 ; AVX2-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2000 ; AVX2-NEXT: vpbroadcastd %xmm5, %xmm2
2001 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2002 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2003 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2004 ; AVX2-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
2005 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2006 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
2007 ; AVX2-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2008 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2009 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2010 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
2011 ; AVX2-NEXT: # ymm6 = mem[0,1,1,3]
2012 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,4]
2013 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2014 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
2015 ; AVX2-NEXT: # ymm7 = mem[0,1,1,3]
2016 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4]
2017 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2018 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5,6],ymm8[7]
2019 ; AVX2-NEXT: vpermq $212, (%rsp), %ymm8 # 32-byte Folded Reload
2020 ; AVX2-NEXT: # ymm8 = mem[0,1,1,3]
2021 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
2022 ; AVX2-NEXT: # ymm9 = mem[0,1,1,3]
2023 ; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[0,2,2,3,4,6,6,7]
2024 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2025 ; AVX2-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[0,2,2,3,4,6,6,7]
2026 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2027 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
2028 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2029 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
2030 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1]
2031 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm1[1],xmm11[2,3]
2032 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2033 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
2034 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2035 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2036 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7]
2037 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2038 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2039 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
2040 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
2041 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
2042 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
2043 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3]
2044 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2045 ; AVX2-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
2046 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
2047 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2048 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
2049 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2050 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
2051 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,1,2,3,7,5,6,7]
2052 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7]
2053 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2054 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2055 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
2056 ; AVX2-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2057 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
2058 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
2059 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2060 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
2061 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
2062 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2063 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2064 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2065 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2066 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2067 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2068 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7]
2069 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2070 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2071 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2072 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
2073 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2074 ; AVX2-NEXT: vmovaps %ymm1, (%rdx)
2075 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2076 ; AVX2-NEXT: vmovaps %ymm1, (%rcx)
2077 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2078 ; AVX2-NEXT: vmovaps %ymm1, (%r8)
2079 ; AVX2-NEXT: vmovdqa %ymm3, (%r9)
2080 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2081 ; AVX2-NEXT: vmovdqa %ymm4, (%rax)
2082 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2083 ; AVX2-NEXT: vmovdqa %ymm10, (%rax)
2084 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
2085 ; AVX2-NEXT: vmovdqa %ymm0, (%rax)
2086 ; AVX2-NEXT: addq $264, %rsp # imm = 0x108
2087 ; AVX2-NEXT: vzeroupper
2088 ; AVX2-NEXT: retq
2089 ;
2090 ; AVX2-FP-LABEL: load_i16_stride8_vf16:
2091 ; AVX2-FP: # %bb.0:
2092 ; AVX2-FP-NEXT: subq $264, %rsp # imm = 0x108
2093 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm2
2094 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2095 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm5
2096 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2097 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm0
2098 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2099 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm1
2100 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2101 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2102 ; AVX2-FP-NEXT: vpbroadcastd %xmm12, %xmm0
2103 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm1
2104 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2105 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm3
2106 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2107 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2108 ; AVX2-FP-NEXT: vpbroadcastd %xmm9, %xmm1
2109 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2110 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
2111 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2112 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
2113 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2114 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4
2115 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2116 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm6
2117 ; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2118 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
2119 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2120 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
2121 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3]
2122 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm13 = ymm5[0,1,0,2]
2123 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,2,0,4,5,6,4]
2124 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2125 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm2[0,1,0,2]
2126 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,0,4,5,6,4]
2127 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2128 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
2129 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm0
2130 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2131 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1
2132 ; AVX2-FP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
2133 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2]
2134 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,2,2,3,4,6,6,7]
2135 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2136 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2]
2137 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,2,2,3,4,6,6,7]
2138 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2139 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
2140 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2141 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
2142 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2143 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,1,1]
2144 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
2145 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
2146 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2147 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2148 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2149 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
2150 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2151 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2152 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
2153 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
2154 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2155 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2156 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,2,2,2]
2157 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
2158 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
2159 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2160 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,5,7]
2161 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,1,3,4,5,5,7]
2162 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2163 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2164 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
2165 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,1,2,3,7,5,6,7]
2166 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,2,3,7,5,6,7]
2167 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2168 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2169 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
2170 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
2171 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2172 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2173 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
2174 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,3,2,3]
2175 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
2176 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
2177 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
2178 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2179 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2180 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2181 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2182 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2183 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2184 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2185 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2186 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2187 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2188 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2189 ; AVX2-FP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
2190 ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %xmm1
2191 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2192 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
2193 ; AVX2-FP-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2194 ; AVX2-FP-NEXT: vpbroadcastd %xmm5, %xmm2
2195 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2196 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2197 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2198 ; AVX2-FP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
2199 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2200 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
2201 ; AVX2-FP-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2202 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2203 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2204 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
2205 ; AVX2-FP-NEXT: # ymm6 = mem[0,1,1,3]
2206 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,4]
2207 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2208 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
2209 ; AVX2-FP-NEXT: # ymm7 = mem[0,1,1,3]
2210 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4]
2211 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2212 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5,6],ymm8[7]
2213 ; AVX2-FP-NEXT: vpermq $212, (%rsp), %ymm8 # 32-byte Folded Reload
2214 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3]
2215 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
2216 ; AVX2-FP-NEXT: # ymm9 = mem[0,1,1,3]
2217 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[0,2,2,3,4,6,6,7]
2218 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2219 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[0,2,2,3,4,6,6,7]
2220 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2221 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
2222 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2223 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
2224 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1]
2225 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm1[1],xmm11[2,3]
2226 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2227 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
2228 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2229 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2230 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7]
2231 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2232 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2233 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
2234 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
2235 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
2236 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
2237 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3]
2238 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2239 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
2240 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
2241 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2242 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
2243 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2244 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
2245 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,1,2,3,7,5,6,7]
2246 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7]
2247 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2248 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2249 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
2250 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2251 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
2252 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
2253 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2254 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
2255 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
2256 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2257 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2258 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2259 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2260 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2261 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2262 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7]
2263 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2264 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2265 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2266 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
2267 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2268 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
2269 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2270 ; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
2271 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2272 ; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
2273 ; AVX2-FP-NEXT: vmovdqa %ymm3, (%r9)
2274 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2275 ; AVX2-FP-NEXT: vmovdqa %ymm4, (%rax)
2276 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2277 ; AVX2-FP-NEXT: vmovdqa %ymm10, (%rax)
2278 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2279 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax)
2280 ; AVX2-FP-NEXT: addq $264, %rsp # imm = 0x108
2281 ; AVX2-FP-NEXT: vzeroupper
2282 ; AVX2-FP-NEXT: retq
2283 ;
2284 ; AVX2-FCP-LABEL: load_i16_stride8_vf16:
2285 ; AVX2-FCP: # %bb.0:
2286 ; AVX2-FCP-NEXT: subq $264, %rsp # imm = 0x108
2287 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
2288 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2289 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
2290 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2291 ; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
2292 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2293 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
2294 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2295 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2296 ; AVX2-FCP-NEXT: vpbroadcastd %xmm12, %xmm0
2297 ; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
2298 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2299 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm3
2300 ; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2301 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
2302 ; AVX2-FCP-NEXT: vpbroadcastd %xmm9, %xmm1
2303 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2304 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
2305 ; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2306 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
2307 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2308 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
2309 ; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2310 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
2311 ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2312 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
2313 ; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2314 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
2315 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm14 = xmm4[0,1],xmm3[2,3]
2316 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm5[0,1,0,2]
2317 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm13[0,1,2,0,4,5,6,4]
2318 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2319 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm2[0,1,0,2]
2320 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm11[0,1,2,0,4,5,6,4]
2321 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2322 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
2323 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
2324 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2325 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
2326 ; AVX2-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
2327 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm1[0,1,0,2]
2328 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,2,2,3,4,6,6,7]
2329 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2330 ; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,1,0,2]
2331 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[0,2,2,3,4,6,6,7]
2332 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2333 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm15[0,1,2,3,4],ymm2[5],ymm15[6,7]
2334 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2335 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm14[0,1,2,3],ymm2[4,5,6,7]
2336 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2337 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm7[1,1,1,1]
2338 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
2339 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
2340 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
2341 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2342 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2343 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
2344 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2345 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2346 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
2347 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
2348 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2349 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2350 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[2,2,2,2]
2351 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
2352 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm7[2],xmm8[2],xmm7[3],xmm8[3]
2353 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2354 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,1,3,4,5,5,7]
2355 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm11[0,1,1,3,4,5,5,7]
2356 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2357 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2358 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
2359 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm10[3,1,2,3,7,5,6,7]
2360 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[3,1,2,3,7,5,6,7]
2361 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2362 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2363 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
2364 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
2365 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2366 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2367 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
2368 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[2,3,2,3]
2369 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,3,3,3]
2370 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
2371 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
2372 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2373 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2374 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2375 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2376 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2377 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2378 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2379 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2380 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2381 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2382 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
2383 ; AVX2-FCP-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
2384 ; AVX2-FCP-NEXT: vpbroadcastd %xmm0, %xmm1
2385 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2386 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload
2387 ; AVX2-FCP-NEXT: # xmm5 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2388 ; AVX2-FCP-NEXT: vpbroadcastd %xmm5, %xmm2
2389 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2390 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2391 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2392 ; AVX2-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
2393 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2394 ; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
2395 ; AVX2-FCP-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
2396 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2397 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2398 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
2399 ; AVX2-FCP-NEXT: # ymm6 = mem[0,1,1,3]
2400 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm6[0,1,2,0,4,5,6,4]
2401 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2402 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
2403 ; AVX2-FCP-NEXT: # ymm7 = mem[0,1,1,3]
2404 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm7[0,1,2,0,4,5,6,4]
2405 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2406 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm9[0,1,2,3,4,5,6],ymm8[7]
2407 ; AVX2-FCP-NEXT: vpermq $212, (%rsp), %ymm8 # 32-byte Folded Reload
2408 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,1,3]
2409 ; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
2410 ; AVX2-FCP-NEXT: # ymm9 = mem[0,1,1,3]
2411 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm8[0,2,2,3,4,6,6,7]
2412 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2413 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm9[0,2,2,3,4,6,6,7]
2414 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2415 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
2416 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2417 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
2418 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,1,1]
2419 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0],xmm1[1],xmm11[2,3]
2420 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
2421 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm11[0,1],xmm12[2,3]
2422 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2423 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2424 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5,6],ymm4[7]
2425 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2426 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2427 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
2428 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm10[0,1,2,3,4,5],ymm4[6,7]
2429 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
2430 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm0[2,2,2,2]
2431 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm5[0,1,2],xmm10[3]
2432 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2433 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
2434 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[0,1,1,3,4,5,5,7]
2435 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2436 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
2437 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2438 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6],ymm11[7]
2439 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[3,1,2,3,7,5,6,7]
2440 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[3,1,2,3,7,5,6,7]
2441 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2442 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2443 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
2444 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
2445 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
2446 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
2447 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
2448 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
2449 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
2450 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2451 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2452 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2453 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
2454 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2455 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2456 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4],ymm2[5],ymm5[6,7]
2457 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2458 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2459 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2460 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
2461 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2462 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
2463 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2464 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
2465 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
2466 ; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
2467 ; AVX2-FCP-NEXT: vmovdqa %ymm3, (%r9)
2468 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2469 ; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rax)
2470 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2471 ; AVX2-FCP-NEXT: vmovdqa %ymm10, (%rax)
2472 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2473 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
2474 ; AVX2-FCP-NEXT: addq $264, %rsp # imm = 0x108
2475 ; AVX2-FCP-NEXT: vzeroupper
2476 ; AVX2-FCP-NEXT: retq
2477 ;
2478 ; AVX512-LABEL: load_i16_stride8_vf16:
2479 ; AVX512: # %bb.0:
2480 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm0
2481 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1
2482 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2483 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm24
2484 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm25
2485 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm0
2486 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm1
2487 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2488 ; AVX512-NEXT: vmovdqa64 %xmm1, %xmm26
2489 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27
2490 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
2491 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
2492 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
2493 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
2494 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm30
2495 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
2496 ; AVX512-NEXT: vpermt2d %xmm5, %xmm7, %xmm6
2497 ; AVX512-NEXT: vmovdqa (%rdi), %xmm9
2498 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm10
2499 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm12
2500 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm13
2501 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2502 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2503 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
2504 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
2505 ; AVX512-NEXT: vmovdqa64 224(%rdi), %ymm18
2506 ; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2]
2507 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4]
2508 ; AVX512-NEXT: vmovdqa64 192(%rdi), %ymm20
2509 ; AVX512-NEXT: vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2]
2510 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2511 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
2512 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2513 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
2514 ; AVX512-NEXT: vmovdqa64 160(%rdi), %ymm23
2515 ; AVX512-NEXT: vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2]
2516 ; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm16
2517 ; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2]
2518 ; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
2519 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2520 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
2521 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2522 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
2523 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2524 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
2525 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm28
2526 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2527 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2528 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
2529 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2530 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2531 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
2532 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2533 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
2534 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
2535 ; AVX512-NEXT: vmovdqa64 %xmm29, %xmm2
2536 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
2537 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2538 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm29
2539 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
2540 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2541 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
2542 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2543 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
2544 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7]
2545 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2546 ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
2547 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2548 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
2549 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
2550 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
2551 ; AVX512-NEXT: vmovdqa64 %xmm30, %xmm6
2552 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
2553 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
2554 ; AVX512-NEXT: vmovdqa64 %ymm1, %ymm19
2555 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
2556 ; AVX512-NEXT: vpermt2d %xmm4, %xmm17, %xmm14
2557 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
2558 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2559 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2560 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
2561 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2562 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2563 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2564 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2565 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2566 ; AVX512-NEXT: vmovdqa64 %xmm24, %xmm0
2567 ; AVX512-NEXT: vmovdqa64 %xmm25, %xmm1
2568 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2569 ; AVX512-NEXT: vmovdqa64 %xmm26, %xmm0
2570 ; AVX512-NEXT: vmovdqa64 %xmm27, %xmm2
2571 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2572 ; AVX512-NEXT: vpermi2d %xmm1, %xmm2, %xmm7
2573 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
2574 ; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
2575 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2576 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3]
2577 ; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3]
2578 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4]
2579 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2580 ; AVX512-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3]
2581 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4]
2582 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2583 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
2584 ; AVX512-NEXT: vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3]
2585 ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
2586 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2587 ; AVX512-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
2588 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7]
2589 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2590 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7]
2591 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
2592 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7]
2593 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2594 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2595 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
2596 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2597 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2598 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
2599 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
2600 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2601 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
2602 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
2603 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
2604 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7]
2605 ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7]
2606 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2607 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7]
2608 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2609 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
2610 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7]
2611 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2612 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
2613 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2614 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
2615 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
2616 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2]
2617 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3]
2618 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2619 ; AVX512-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
2620 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
2621 ; AVX512-NEXT: vpermt2d %xmm4, %xmm17, %xmm3
2622 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2623 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
2624 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2625 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2626 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
2627 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2628 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2629 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2630 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2631 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2632 ; AVX512-NEXT: vmovdqa64 %ymm28, (%rsi)
2633 ; AVX512-NEXT: vmovdqa64 %ymm29, (%rdx)
2634 ; AVX512-NEXT: vmovdqa64 %ymm19, (%rcx)
2635 ; AVX512-NEXT: vmovdqa %ymm8, (%r8)
2636 ; AVX512-NEXT: vmovdqa %ymm10, (%r9)
2637 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
2638 ; AVX512-NEXT: vmovdqa %ymm7, (%rax)
2639 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
2640 ; AVX512-NEXT: vmovdqa %ymm5, (%rax)
2641 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
2642 ; AVX512-NEXT: vmovdqa %ymm0, (%rax)
2643 ; AVX512-NEXT: vzeroupper
2644 ; AVX512-NEXT: retq
2645 ;
2646 ; AVX512-FCP-LABEL: load_i16_stride8_vf16:
2647 ; AVX512-FCP: # %bb.0:
2648 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
2649 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
2650 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2651 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
2652 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm27
2653 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
2654 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
2655 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2656 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
2657 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm29
2658 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
2659 ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm13
2660 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm13
2661 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
2662 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
2663 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm9
2664 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm10
2665 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2666 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
2667 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm30
2668 ; AVX512-FCP-NEXT: vmovdqa64 %xmm0, %xmm31
2669 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
2670 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3]
2671 ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
2672 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
2673 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4]
2674 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2675 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
2676 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2]
2677 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4]
2678 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2679 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
2680 ; AVX512-FCP-NEXT: vmovdqa64 160(%rdi), %ymm19
2681 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2]
2682 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
2683 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2]
2684 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
2685 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2686 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
2687 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2688 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
2689 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
2690 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
2691 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm24
2692 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
2693 ; AVX512-FCP-NEXT: vmovdqa %xmm11, %xmm1
2694 ; AVX512-FCP-NEXT: vpermt2d %xmm5, %xmm15, %xmm1
2695 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
2696 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
2697 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2698 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2699 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
2700 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2701 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2702 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
2703 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
2704 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2705 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm25
2706 ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm0
2707 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
2708 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6]
2709 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm12, %xmm0
2710 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
2711 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
2712 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7]
2713 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2714 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
2715 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2716 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
2717 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7]
2718 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2719 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7]
2720 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2721 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
2722 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
2723 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2724 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
2725 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0]
2726 ; AVX512-FCP-NEXT: vpermt2d %xmm5, %xmm18, %xmm11
2727 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
2728 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2729 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2730 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
2731 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2732 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2733 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
2734 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2735 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2736 ; AVX512-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
2737 ; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm0
2738 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
2739 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2740 ; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
2741 ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm1
2742 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2743 ; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm7
2744 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
2745 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
2746 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
2747 ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2748 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
2749 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3]
2750 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3]
2751 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4]
2752 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2753 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3]
2754 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4]
2755 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2756 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7]
2757 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3]
2758 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
2759 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2760 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3]
2761 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7]
2762 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2763 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
2764 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
2765 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
2766 ; AVX512-FCP-NEXT: vpermi2d %xmm6, %xmm5, %xmm15
2767 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2768 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
2769 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2770 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2771 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
2772 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2773 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2774 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
2775 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
2776 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2777 ; AVX512-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm12
2778 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
2779 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
2780 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7]
2781 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2782 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7]
2783 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2784 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
2785 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7]
2786 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2787 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
2788 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2789 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
2790 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
2791 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
2792 ; AVX512-FCP-NEXT: vpermt2d %xmm6, %xmm18, %xmm5
2793 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2794 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
2795 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2796 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2797 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
2798 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2799 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2800 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
2801 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
2802 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
2803 ; AVX512-FCP-NEXT: vmovdqa64 %ymm24, (%rsi)
2804 ; AVX512-FCP-NEXT: vmovdqa64 %ymm25, (%rdx)
2805 ; AVX512-FCP-NEXT: vmovdqa64 %ymm21, (%rcx)
2806 ; AVX512-FCP-NEXT: vmovdqa64 %ymm22, (%r8)
2807 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r9)
2808 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2809 ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rax)
2810 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2811 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rax)
2812 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
2813 ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rax)
2814 ; AVX512-FCP-NEXT: vzeroupper
2815 ; AVX512-FCP-NEXT: retq
2817 ; AVX512DQ-LABEL: load_i16_stride8_vf16:
2818 ; AVX512DQ: # %bb.0:
2819 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm0
2820 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm1
2821 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2822 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm24
2823 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm25
2824 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm0
2825 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm1
2826 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2827 ; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm26
2828 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27
2829 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
2830 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm29 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
2831 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
2832 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
2833 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm30
2834 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm11 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
2835 ; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm7, %xmm6
2836 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm9
2837 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm10
2838 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm12
2839 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm13
2840 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3]
2841 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
2842 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm14[0],xmm4[0],xmm14[1],xmm4[1]
2843 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
2844 ; AVX512DQ-NEXT: vmovdqa64 224(%rdi), %ymm18
2845 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm18[0,1,0,2]
2846 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm17[0,1,2,0,4,5,6,4]
2847 ; AVX512DQ-NEXT: vmovdqa64 192(%rdi), %ymm20
2848 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm19 = ymm20[0,1,0,2]
2849 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2850 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[0,1,2,0,4,5,6,4]
2851 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2852 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm15[7]
2853 ; AVX512DQ-NEXT: vmovdqa64 160(%rdi), %ymm23
2854 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm21 = ymm23[0,1,0,2]
2855 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm16
2856 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm16[0,1,0,2]
2857 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm21[0,2,2,3,4,6,6,7]
2858 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2859 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
2860 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2861 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
2862 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2863 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
2864 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm28
2865 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2866 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2867 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
2868 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2869 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2870 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
2871 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
2872 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
2873 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3]
2874 ; AVX512DQ-NEXT: vmovdqa64 %xmm29, %xmm2
2875 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
2876 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2877 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm29
2878 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
2879 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2880 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
2881 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2882 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
2883 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[3,1,2,3,7,5,6,7]
2884 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2885 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
2886 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2887 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm15[0,1,2,3,4],ymm5[5],ymm15[6,7]
2888 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
2889 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm14[2],xmm4[2],xmm14[3],xmm4[3]
2890 ; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm6
2891 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3]
2892 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7]
2893 ; AVX512DQ-NEXT: vmovdqa64 %ymm1, %ymm19
2894 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
2895 ; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm17, %xmm14
2896 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3]
2897 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2898 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2899 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
2900 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2901 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2902 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2903 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2904 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2905 ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm0
2906 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm1
2907 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
2908 ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm0
2909 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm2
2910 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
2911 ; AVX512DQ-NEXT: vpermi2d %xmm1, %xmm2, %xmm7
2912 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm12[4],xmm13[4],xmm12[5],xmm13[5],xmm12[6],xmm13[6],xmm12[7],xmm13[7]
2913 ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
2914 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2915 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm0[0,1],xmm7[2,3]
2916 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm18[0,1,1,3]
2917 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[0,1,2,0,4,5,6,4]
2918 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
2919 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm11 = ymm20[0,1,1,3]
2920 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm11[0,1,2,0,4,5,6,4]
2921 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
2922 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm13[0,1,2,3,4,5,6],ymm10[7]
2923 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm13 = ymm23[0,1,1,3]
2924 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
2925 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
2926 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm16 = ymm16[0,1,1,3]
2927 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm16[0,2,2,3,4,6,6,7]
2928 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
2929 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm15[5],ymm6[6,7]
2930 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
2931 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1,2,3],ymm6[4,5,6,7]
2932 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
2933 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
2934 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
2935 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
2936 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
2937 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7]
2938 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm5[6,7]
2939 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2940 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
2941 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3]
2942 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
2943 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm5[0,1,2,3],ymm0[4,5,6,7]
2944 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm9[0,1,1,3,4,5,5,7]
2945 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
2946 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm11[0,1,1,3,4,5,5,7]
2947 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
2948 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
2949 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[3,1,2,3,7,5,6,7]
2950 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm9[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
2951 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
2952 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
2953 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
2954 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3,4,5],ymm5[6,7]
2955 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[2,2,2,2]
2956 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm2[0,1,2],xmm11[3]
2957 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2958 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm11 = xmm13[0,1],xmm11[2,3]
2959 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0,1,2,3],ymm5[4,5,6,7]
2960 ; AVX512DQ-NEXT: vpermt2d %xmm4, %xmm17, %xmm3
2961 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
2962 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
2963 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
2964 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
2965 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
2966 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
2967 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
2968 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
2969 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
2970 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2971 ; AVX512DQ-NEXT: vmovdqa64 %ymm28, (%rsi)
2972 ; AVX512DQ-NEXT: vmovdqa64 %ymm29, (%rdx)
2973 ; AVX512DQ-NEXT: vmovdqa64 %ymm19, (%rcx)
2974 ; AVX512DQ-NEXT: vmovdqa %ymm8, (%r8)
2975 ; AVX512DQ-NEXT: vmovdqa %ymm10, (%r9)
2976 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
2977 ; AVX512DQ-NEXT: vmovdqa %ymm7, (%rax)
2978 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
2979 ; AVX512DQ-NEXT: vmovdqa %ymm5, (%rax)
2980 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
2981 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rax)
2982 ; AVX512DQ-NEXT: vzeroupper
2983 ; AVX512DQ-NEXT: retq
2985 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf16:
2986 ; AVX512DQ-FCP: # %bb.0:
2987 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
2988 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
2989 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2990 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
2991 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm27
2992 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
2993 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
2994 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2995 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm28
2996 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm29
2997 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm7 = [0,0,0,4]
2998 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm13
2999 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm7, %xmm13
3000 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
3001 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm1
3002 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm9
3003 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm10
3004 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3]
3005 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3006 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm30
3007 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm0, %xmm31
3008 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm11[0],xmm5[0],xmm11[1],xmm5[1]
3009 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm15 = xmm14[0,1],xmm13[2,3]
3010 ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm16
3011 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm16[0,1,0,2]
3012 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm18[0,1,2,0,4,5,6,4]
3013 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
3014 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17
3015 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm17[0,1,0,2]
3016 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm20[0,1,2,0,4,5,6,4]
3017 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
3018 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm14[7]
3019 ; AVX512DQ-FCP-NEXT: vmovdqa64 160(%rdi), %ymm19
3020 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm19[0,1,0,2]
3021 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %ymm23
3022 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm23[0,1,0,2]
3023 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm21[0,2,2,3,4,6,6,7]
3024 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
3025 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm22[0,2,2,3,4,6,6,7]
3026 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
3027 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5],ymm6[6,7]
3028 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
3029 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7]
3030 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm24
3031 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
3032 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm11, %xmm1
3033 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm5, %xmm15, %xmm1
3034 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1]
3035 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
3036 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
3037 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
3038 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
3039 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3040 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
3041 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
3042 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
3043 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
3044 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm25
3045 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm0
3046 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm4[2],xmm12[3],xmm4[3]
3047 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm12 = [0,0,2,6]
3048 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm12, %xmm0
3049 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm11[2],xmm5[2],xmm11[3],xmm5[3]
3050 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
3051 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,1,1,3,4,5,5,7]
3052 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
3053 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
3054 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
3055 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
3056 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm21[3,1,2,3,7,5,6,7]
3057 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
3058 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm22[3,1,2,3,7,5,6,7]
3059 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
3060 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm13[5],ymm8[6,7]
3061 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
3062 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3063 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm21
3064 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm18 = [3,7,0,0]
3065 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm5, %xmm18, %xmm11
3066 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3]
3067 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
3068 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
3069 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
3070 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
3071 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
3072 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
3073 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
3074 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
3075 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm0, %ymm22
3076 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm0
3077 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
3078 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3079 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm0
3080 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm1
3081 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
3082 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm7
3083 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
3084 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
3085 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
3086 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
3087 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
3088 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm7[2,3]
3089 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm20 = ymm16[0,1,1,3]
3090 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm20[0,1,2,0,4,5,6,4]
3091 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
3092 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm17[0,1,1,3]
3093 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,4]
3094 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
3095 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5,6],ymm9[7]
3096 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm13 = ymm19[0,1,1,3]
3097 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[0,2,2,3,4,6,6,7]
3098 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
3099 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm16 = ymm23[0,1,1,3]
3100 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[0,2,2,3,4,6,6,7]
3101 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
3102 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4],ymm0[5],ymm1[6,7]
3103 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7]
3104 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
3105 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm6, %xmm5, %xmm15
3106 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
3107 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm0[2,3]
3108 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
3109 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
3110 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6],ymm1[7]
3111 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
3112 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
3113 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5],ymm8[6,7]
3114 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5],ymm1[6,7]
3115 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3116 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm3, %xmm4, %xmm12
3117 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm5[2],xmm6[2],xmm5[3],xmm6[3]
3118 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
3119 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,1,1,3,4,5,5,7]
3120 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
3121 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,1,3,4,5,5,7]
3122 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
3123 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2,3,4,5,6],ymm8[7]
3124 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm13[3,1,2,3,7,5,6,7]
3125 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
3126 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm16[3,1,2,3,7,5,6,7]
3127 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
3128 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4],ymm11[5],ymm13[6,7]
3129 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm11[0,1,2,3,4,5],ymm8[6,7]
3130 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm8[4,5,6,7]
3131 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm6, %xmm18, %xmm5
3132 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
3133 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
3134 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
3135 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
3136 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
3137 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
3138 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm12[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
3139 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
3140 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
3141 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
3142 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm24, (%rsi)
3143 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm25, (%rdx)
3144 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm21, (%rcx)
3145 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm22, (%r8)
3146 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r9)
3147 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3148 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rax)
3149 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3150 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rax)
3151 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3152 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rax)
3153 ; AVX512DQ-FCP-NEXT: vzeroupper
3154 ; AVX512DQ-FCP-NEXT: retq
3156 ; AVX512BW-LABEL: load_i16_stride8_vf16:
3157 ; AVX512BW: # %bb.0:
3158 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3159 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3160 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
3161 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3162 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3163 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3164 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3165 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
3166 ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1]
3167 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3168 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56]
3169 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3170 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3171 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
3172 ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1]
3173 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3174 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57]
3175 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3176 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3177 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
3178 ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1]
3179 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3180 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58]
3181 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3182 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3183 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
3184 ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1]
3185 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3186 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59]
3187 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3188 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3189 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
3190 ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1]
3191 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3192 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60]
3193 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
3194 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3195 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
3196 ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1]
3197 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3198 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61]
3199 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
3200 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3201 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
3202 ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1]
3203 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3204 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62]
3205 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11
3206 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
3207 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
3208 ; AVX512BW-NEXT: # ymm11 = mem[0,1,0,1]
3209 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11
3210 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63]
3211 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3212 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
3213 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rsi)
3214 ; AVX512BW-NEXT: vmovdqa %ymm5, (%rdx)
3215 ; AVX512BW-NEXT: vmovdqa %ymm6, (%rcx)
3216 ; AVX512BW-NEXT: vmovdqa %ymm7, (%r8)
3217 ; AVX512BW-NEXT: vmovdqa %ymm8, (%r9)
3218 ; AVX512BW-NEXT: vmovdqa %ymm9, (%r11)
3219 ; AVX512BW-NEXT: vmovdqa %ymm10, (%r10)
3220 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
3221 ; AVX512BW-NEXT: vzeroupper
3222 ; AVX512BW-NEXT: retq
3224 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf16:
3225 ; AVX512BW-FCP: # %bb.0:
3226 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3227 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3228 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
3229 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3230 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3231 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3232 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3233 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
3234 ; AVX512BW-FCP-NEXT: # ymm4 = mem[0,1,0,1]
3235 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3236 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56]
3237 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3238 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3239 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
3240 ; AVX512BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
3241 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3242 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57]
3243 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3244 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3245 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
3246 ; AVX512BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
3247 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3248 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58]
3249 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3250 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3251 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
3252 ; AVX512BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
3253 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3254 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59]
3255 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3256 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3257 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
3258 ; AVX512BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
3259 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3260 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60]
3261 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
3262 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3263 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
3264 ; AVX512BW-FCP-NEXT: # ymm9 = mem[0,1,0,1]
3265 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3266 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61]
3267 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
3268 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3269 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
3270 ; AVX512BW-FCP-NEXT: # ymm10 = mem[0,1,0,1]
3271 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3272 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62]
3273 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11
3274 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
3275 ; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
3276 ; AVX512BW-FCP-NEXT: # ymm11 = mem[0,1,0,1]
3277 ; AVX512BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11
3278 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63]
3279 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3280 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
3281 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
3282 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
3283 ; AVX512BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
3284 ; AVX512BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
3285 ; AVX512BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
3286 ; AVX512BW-FCP-NEXT: vmovdqa %ymm9, (%r11)
3287 ; AVX512BW-FCP-NEXT: vmovdqa %ymm10, (%r10)
3288 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
3289 ; AVX512BW-FCP-NEXT: vzeroupper
3290 ; AVX512BW-FCP-NEXT: retq
3292 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf16:
3293 ; AVX512DQ-BW: # %bb.0:
3294 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
3295 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
3296 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
3297 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
3298 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3299 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3300 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3301 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
3302 ; AVX512DQ-BW-NEXT: # ymm4 = mem[0,1,0,1]
3303 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3304 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56]
3305 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3306 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3307 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
3308 ; AVX512DQ-BW-NEXT: # ymm5 = mem[0,1,0,1]
3309 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3310 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57]
3311 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3312 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3313 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
3314 ; AVX512DQ-BW-NEXT: # ymm6 = mem[0,1,0,1]
3315 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3316 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58]
3317 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3318 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3319 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
3320 ; AVX512DQ-BW-NEXT: # ymm7 = mem[0,1,0,1]
3321 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3322 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59]
3323 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3324 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3325 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
3326 ; AVX512DQ-BW-NEXT: # ymm8 = mem[0,1,0,1]
3327 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3328 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60]
3329 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
3330 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3331 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
3332 ; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1]
3333 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3334 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61]
3335 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
3336 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3337 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
3338 ; AVX512DQ-BW-NEXT: # ymm10 = mem[0,1,0,1]
3339 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3340 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62]
3341 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11
3342 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
3343 ; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
3344 ; AVX512DQ-BW-NEXT: # ymm11 = mem[0,1,0,1]
3345 ; AVX512DQ-BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11
3346 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63]
3347 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3348 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
3349 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rsi)
3350 ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%rdx)
3351 ; AVX512DQ-BW-NEXT: vmovdqa %ymm6, (%rcx)
3352 ; AVX512DQ-BW-NEXT: vmovdqa %ymm7, (%r8)
3353 ; AVX512DQ-BW-NEXT: vmovdqa %ymm8, (%r9)
3354 ; AVX512DQ-BW-NEXT: vmovdqa %ymm9, (%r11)
3355 ; AVX512DQ-BW-NEXT: vmovdqa %ymm10, (%r10)
3356 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax)
3357 ; AVX512DQ-BW-NEXT: vzeroupper
3358 ; AVX512DQ-BW-NEXT: retq
3360 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf16:
3361 ; AVX512DQ-BW-FCP: # %bb.0:
3362 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
3363 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
3364 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
3365 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3366 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3367 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3368 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3369 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
3370 ; AVX512DQ-BW-FCP-NEXT: # ymm4 = mem[0,1,0,1]
3371 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm4
3372 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm5 = [0,8,16,24,32,40,48,56]
3373 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3374 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3],ymm4[4,5,6,7]
3375 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
3376 ; AVX512DQ-BW-FCP-NEXT: # ymm5 = mem[0,1,0,1]
3377 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
3378 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm6 = [1,9,17,25,33,41,49,57]
3379 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3380 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7]
3381 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
3382 ; AVX512DQ-BW-FCP-NEXT: # ymm6 = mem[0,1,0,1]
3383 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
3384 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm7 = [2,10,18,26,34,42,50,58]
3385 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
3386 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
3387 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
3388 ; AVX512DQ-BW-FCP-NEXT: # ymm7 = mem[0,1,0,1]
3389 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
3390 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm8 = [3,11,19,27,35,43,51,59]
3391 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
3392 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7]
3393 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
3394 ; AVX512DQ-BW-FCP-NEXT: # ymm8 = mem[0,1,0,1]
3395 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
3396 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm9 = [4,12,20,28,36,44,52,60]
3397 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
3398 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
3399 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
3400 ; AVX512DQ-BW-FCP-NEXT: # ymm9 = mem[0,1,0,1]
3401 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm9
3402 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm10 = [5,13,21,29,37,45,53,61]
3403 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
3404 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3405 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
3406 ; AVX512DQ-BW-FCP-NEXT: # ymm10 = mem[0,1,0,1]
3407 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm10
3408 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm11 = [6,14,22,30,38,46,54,62]
3409 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm11
3410 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
3411 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
3412 ; AVX512DQ-BW-FCP-NEXT: # ymm11 = mem[0,1,0,1]
3413 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm3, %zmm2, %zmm11
3414 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [7,15,23,31,39,47,55,63]
3415 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
3416 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm11[4,5,6,7]
3417 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rsi)
3418 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%rdx)
3419 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm6, (%rcx)
3420 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm7, (%r8)
3421 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm8, (%r9)
3422 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm9, (%r11)
3423 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm10, (%r10)
3424 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rax)
3425 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
3426 ; AVX512DQ-BW-FCP-NEXT: retq
3427 %wide.vec = load <128 x i16>, ptr %in.vec, align 64
3428 %strided.vec0 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120>
3429 %strided.vec1 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121>
3430 %strided.vec2 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122>
3431 %strided.vec3 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123>
3432 %strided.vec4 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124>
3433 %strided.vec5 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125>
3434 %strided.vec6 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126>
3435 %strided.vec7 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <16 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127>
3436 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
3437 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
3438 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
3439 store <16 x i16> %strided.vec3, ptr %out.vec3, align 64
3440 store <16 x i16> %strided.vec4, ptr %out.vec4, align 64
3441 store <16 x i16> %strided.vec5, ptr %out.vec5, align 64
3442 store <16 x i16> %strided.vec6, ptr %out.vec6, align 64
  store <16 x i16> %strided.vec7, ptr %out.vec7, align 64
  ret void
}
3447 define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
; SSE-LABEL: load_i16_stride8_vf32:
; SSE:       # %bb.0:
3450 ; SSE-NEXT: subq $696, %rsp # imm = 0x2B8
3451 ; SSE-NEXT: movdqa 496(%rdi), %xmm2
3452 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3453 ; SSE-NEXT: movdqa 480(%rdi), %xmm14
3454 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3455 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
3456 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3457 ; SSE-NEXT: movdqa 192(%rdi), %xmm3
3458 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3459 ; SSE-NEXT: movdqa 240(%rdi), %xmm5
3460 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3461 ; SSE-NEXT: movdqa 224(%rdi), %xmm15
3462 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3463 ; SSE-NEXT: movdqa 144(%rdi), %xmm6
3464 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3465 ; SSE-NEXT: movdqa 128(%rdi), %xmm4
3466 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3467 ; SSE-NEXT: movdqa 176(%rdi), %xmm7
3468 ; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill
3469 ; SSE-NEXT: movdqa 160(%rdi), %xmm0
3470 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3471 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3]
3472 ; SSE-NEXT: movdqa %xmm0, %xmm7
3473 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3474 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
3475 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3476 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3]
3477 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3478 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[0,0,0,0]
3479 ; SSE-NEXT: movdqa %xmm3, %xmm15
3480 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
3481 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[0,0,0,0]
3482 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3483 ; SSE-NEXT: movdqa %xmm4, %xmm0
3484 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
3485 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3486 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3487 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3]
3488 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3489 ; SSE-NEXT: movdqa 464(%rdi), %xmm0
3490 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3491 ; SSE-NEXT: movdqa 448(%rdi), %xmm8
3492 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3493 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
3494 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,0,0,0]
3495 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0]
3496 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3497 ; SSE-NEXT: movdqa 432(%rdi), %xmm0
3498 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3499 ; SSE-NEXT: movdqa 416(%rdi), %xmm11
3500 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3501 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
3502 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3503 ; SSE-NEXT: movdqa 400(%rdi), %xmm0
3504 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3505 ; SSE-NEXT: movdqa 384(%rdi), %xmm10
3506 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3507 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
3508 ; SSE-NEXT: movdqa %xmm10, %xmm0
3509 ; SSE-NEXT: movdqa %xmm10, %xmm14
3510 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3511 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
3512 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3513 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3514 ; SSE-NEXT: movdqa 368(%rdi), %xmm0
3515 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3516 ; SSE-NEXT: movdqa 352(%rdi), %xmm9
3517 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3518 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
3519 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3520 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
3521 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3522 ; SSE-NEXT: movdqa 320(%rdi), %xmm12
3523 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3524 ; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
3525 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[0,0,0,0]
3526 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
3527 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3528 ; SSE-NEXT: movdqa 304(%rdi), %xmm0
3529 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3530 ; SSE-NEXT: movdqa 288(%rdi), %xmm11
3531 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3532 ; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3]
3533 ; SSE-NEXT: movdqa 272(%rdi), %xmm0
3534 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3535 ; SSE-NEXT: movdqa 256(%rdi), %xmm10
3536 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3537 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
3538 ; SSE-NEXT: movdqa %xmm10, %xmm0
3539 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
3540 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3541 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3542 ; SSE-NEXT: movdqa 112(%rdi), %xmm0
3543 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3544 ; SSE-NEXT: movdqa 96(%rdi), %xmm9
3545 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3546 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
3547 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
3548 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3549 ; SSE-NEXT: movdqa 64(%rdi), %xmm13
3550 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3551 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
3552 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0]
3553 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0]
3554 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm1[2],xmm3[3],xmm1[3]
3555 ; SSE-NEXT: movdqa 32(%rdi), %xmm7
3556 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3557 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
3558 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3559 ; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
3560 ; SSE-NEXT: movdqa (%rdi), %xmm6
3561 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3562 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
3563 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3564 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
3565 ; SSE-NEXT: movdqa %xmm6, %xmm1
3566 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1]
3567 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
3568 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3569 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3570 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1]
3571 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3572 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[1,1,1,1]
3573 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3574 ; SSE-NEXT: movdqa %xmm15, %xmm0
3575 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3576 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3577 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3578 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3579 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[1,1,1,1]
3580 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3581 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
3582 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3583 ; SSE-NEXT: movdqa %xmm8, %xmm0
3584 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3585 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
3586 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3587 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3588 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
3589 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
3590 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3591 ; SSE-NEXT: movdqa %xmm12, %xmm0
3592 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
3593 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
3594 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3595 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3596 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
3597 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
3598 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3599 ; SSE-NEXT: movdqa %xmm13, %xmm0
3600 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
3601 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3602 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3603 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
3604 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2]
3605 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3606 ; SSE-NEXT: movdqa %xmm4, %xmm0
3607 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
3608 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3609 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3610 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
3611 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2]
3612 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3613 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3614 ; SSE-NEXT: movapd %xmm3, %xmm0
3615 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3616 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
3617 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3618 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3619 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
3620 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[2,2,2,2]
3621 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3622 ; SSE-NEXT: movdqa %xmm10, %xmm0
3623 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm11[2],xmm0[3],xmm11[3]
3624 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3625 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3626 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2]
3627 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
3628 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
3629 ; SSE-NEXT: movdqa %xmm6, %xmm1
3630 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm7[2],xmm1[3],xmm7[3]
3631 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3632 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3633 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3634 ; SSE-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
3635 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
3636 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
3637 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3638 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm15[2,3]
3639 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3640 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3641 ; SSE-NEXT: # xmm8 = xmm8[2],mem[2],xmm8[3],mem[3]
3642 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,3,3,3]
3643 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
3644 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3645 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
3646 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3647 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm14[2],xmm12[3],xmm14[3]
3648 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[3,3,3,3]
3649 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[3,3,3,3]
3650 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3651 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
3652 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3653 ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm9[2],xmm13[3],xmm9[3]
3654 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3]
3655 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,3,3,3]
3656 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3657 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm13[2,3]
3658 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3659 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3660 ; SSE-NEXT: punpckhwd (%rsp), %xmm3 # 16-byte Folded Reload
3661 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
3662 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
3663 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3664 ; SSE-NEXT: # xmm15 = xmm15[4],mem[4],xmm15[5],mem[5],xmm15[6],mem[6],xmm15[7],mem[7]
3665 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3666 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3667 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3668 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3669 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3670 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
3671 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
3672 ; SSE-NEXT: movdqa %xmm1, %xmm12
3673 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
3674 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3675 ; SSE-NEXT: movdqa %xmm15, %xmm0
3676 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
3677 ; SSE-NEXT: movdqa %xmm3, %xmm7
3678 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3679 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
3680 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3681 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3682 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3683 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
3684 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3685 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3686 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
3687 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3688 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3689 ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
3690 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3691 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3692 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
3693 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3694 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,0,0,0]
3695 ; SSE-NEXT: movdqa %xmm8, %xmm11
3696 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0]
3697 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
3698 ; SSE-NEXT: movdqa %xmm4, %xmm0
3699 ; SSE-NEXT: movdqa %xmm4, %xmm14
3700 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3701 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3702 ; SSE-NEXT: movdqa %xmm1, %xmm13
3703 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3704 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
3705 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3706 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
3707 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
3708 ; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
3709 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
3710 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
3711 ; SSE-NEXT: # xmm9 = xmm9[4],mem[4],xmm9[5],mem[5],xmm9[6],mem[6],xmm9[7],mem[7]
3712 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3713 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3714 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
3715 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3716 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3717 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3718 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
3719 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3720 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3721 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,0,0,0]
3722 ; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3]
3723 ; SSE-NEXT: movdqa %xmm9, %xmm0
3724 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
3725 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm0[0],xmm5[1]
3726 ; SSE-NEXT: movapd %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3727 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3728 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3729 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
3730 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3731 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3732 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3733 ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
3734 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3735 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,0,0,0]
3736 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3737 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3738 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
3739 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3740 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
3741 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3742 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
3743 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3744 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
3745 ; SSE-NEXT: movdqa %xmm4, %xmm3
3746 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
3747 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
3748 ; SSE-NEXT: movaps %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3749 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[1,1,1,1]
3750 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
3751 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3752 ; SSE-NEXT: movdqa %xmm2, %xmm0
3753 ; SSE-NEXT: movdqa %xmm2, %xmm5
3754 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3755 ; SSE-NEXT: movdqa %xmm12, %xmm3
3756 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3757 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
3758 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
3759 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3760 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[1,1,1,1]
3761 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
3762 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3763 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3764 ; SSE-NEXT: movdqa %xmm1, %xmm0
3765 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
3766 ; SSE-NEXT: movdqa %xmm11, %xmm6
3767 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3768 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
3769 ; SSE-NEXT: movaps %xmm2, (%rsp) # 16-byte Spill
3770 ; SSE-NEXT: movdqa %xmm9, %xmm14
3771 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3772 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,1,1]
3773 ; SSE-NEXT: movdqa %xmm10, %xmm13
3774 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3775 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
3776 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
3777 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3778 ; SSE-NEXT: movaps %xmm7, %xmm0
3779 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
3780 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
3781 ; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[0,1],xmm0[2,3]
3782 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm4[1,1,1,1]
3783 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3784 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,1,1]
3785 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
3786 ; SSE-NEXT: movdqa %xmm8, %xmm0
3787 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3788 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
3789 ; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,1],xmm0[2,3]
3790 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
3791 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[2,2,2,2]
3792 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3]
3793 ; SSE-NEXT: movdqa %xmm15, %xmm0
3794 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3795 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
3796 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1]
3797 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[2,2,2,2]
3798 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[2,2,2,2]
3799 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm0[2],xmm12[3],xmm0[3]
3800 ; SSE-NEXT: movdqa %xmm14, %xmm0
3801 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm13[2],xmm0[3],xmm13[3]
3802 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm0[0],xmm12[1]
3803 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
3804 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[2,2,2,2]
3805 ; SSE-NEXT: movdqa %xmm1, %xmm3
3806 ; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm0[2],xmm13[3],xmm0[3]
3807 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
3808 ; SSE-NEXT: movapd %xmm6, %xmm0
3809 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
3810 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm5[2],xmm0[3],xmm5[3]
3811 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm0[0],xmm13[1]
3812 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[2,2,2,2]
3813 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
3814 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2]
3815 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3816 ; SSE-NEXT: movdqa %xmm4, %xmm14
3817 ; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm2[2],xmm14[3],xmm2[3]
3818 ; SSE-NEXT: movdqa %xmm2, %xmm8
3819 ; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,1],xmm1[2,3]
3820 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3821 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3822 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
3823 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[3,3,3,3]
3824 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3825 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
3826 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3827 ; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,1],xmm1[2,3]
3828 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3829 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3830 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
3831 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3832 ; SSE-NEXT: # xmm2 = mem[3,3,3,3]
3833 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3834 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
3835 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3836 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
3837 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3838 ; SSE-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
3839 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,3,3,3]
3840 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,3,3,3]
3841 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3842 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
3843 ; SSE-NEXT: punpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3844 ; SSE-NEXT: # xmm7 = xmm7[2],mem[2],xmm7[3],mem[3]
3845 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,3,3,3]
3846 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,3,3,3]
3847 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
3848 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
3849 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3850 ; SSE-NEXT: movaps %xmm3, 32(%rsi)
3851 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3852 ; SSE-NEXT: movaps %xmm3, 48(%rsi)
3853 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3854 ; SSE-NEXT: movaps %xmm3, (%rsi)
3855 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
3856 ; SSE-NEXT: movaps %xmm8, 16(%rsi)
3857 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3858 ; SSE-NEXT: movaps %xmm3, 32(%rdx)
3859 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3860 ; SSE-NEXT: movaps %xmm3, 48(%rdx)
3861 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3862 ; SSE-NEXT: movaps %xmm3, (%rdx)
3863 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3864 ; SSE-NEXT: movaps %xmm3, 16(%rdx)
3865 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3866 ; SSE-NEXT: movaps %xmm3, 32(%rcx)
3867 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3868 ; SSE-NEXT: movaps %xmm3, 48(%rcx)
3869 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3870 ; SSE-NEXT: movaps %xmm3, (%rcx)
3871 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3872 ; SSE-NEXT: movaps %xmm3, 16(%rcx)
3873 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3874 ; SSE-NEXT: movaps %xmm3, 32(%r8)
3875 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3876 ; SSE-NEXT: movaps %xmm3, 48(%r8)
3877 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3878 ; SSE-NEXT: movaps %xmm3, (%r8)
3879 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3880 ; SSE-NEXT: movaps %xmm3, 16(%r8)
3881 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3882 ; SSE-NEXT: movaps %xmm3, 32(%r9)
3883 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3884 ; SSE-NEXT: movaps %xmm3, 48(%r9)
3885 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3886 ; SSE-NEXT: movaps %xmm3, (%r9)
3887 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3888 ; SSE-NEXT: movaps %xmm3, 16(%r9)
3889 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3890 ; SSE-NEXT: movaps %xmm9, 32(%rax)
3891 ; SSE-NEXT: movaps (%rsp), %xmm3 # 16-byte Reload
3892 ; SSE-NEXT: movaps %xmm3, 48(%rax)
3893 ; SSE-NEXT: movaps %xmm10, (%rax)
3894 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
3895 ; SSE-NEXT: movaps %xmm3, 16(%rax)
3896 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3897 ; SSE-NEXT: movapd %xmm13, 48(%rax)
3898 ; SSE-NEXT: movapd %xmm12, 32(%rax)
3899 ; SSE-NEXT: movapd %xmm11, 16(%rax)
3900 ; SSE-NEXT: movaps %xmm14, (%rax)
3901 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
3902 ; SSE-NEXT: movaps %xmm1, 48(%rax)
3903 ; SSE-NEXT: movaps %xmm2, 32(%rax)
3904 ; SSE-NEXT: movaps %xmm15, 16(%rax)
3905 ; SSE-NEXT: movaps %xmm0, (%rax)
3906 ; SSE-NEXT: addq $696, %rsp # imm = 0x2B8
3907 ; SSE-NEXT: retq
3908 ;
3909 ; AVX-LABEL: load_i16_stride8_vf32:
3910 ; AVX: # %bb.0:
3911 ; AVX-NEXT: subq $872, %rsp # imm = 0x368
3912 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm0
3913 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3914 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm1
3915 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3916 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3917 ; AVX-NEXT: vmovdqa 272(%rdi), %xmm0
3918 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3919 ; AVX-NEXT: vmovdqa 256(%rdi), %xmm1
3920 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3921 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3922 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
3923 ; AVX-NEXT: vmovdqa 368(%rdi), %xmm1
3924 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3925 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm2
3926 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3927 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3928 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,0,0,0]
3929 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm2
3930 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3931 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm3
3932 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3933 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3934 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,1,0,1]
3935 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
3936 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
3937 ; AVX-NEXT: vmovdqa 496(%rdi), %xmm1
3938 ; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
3939 ; AVX-NEXT: vmovdqa 480(%rdi), %xmm2
3940 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3941 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3942 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,0,0,0]
3943 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3944 ; AVX-NEXT: vmovdqa 464(%rdi), %xmm2
3945 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3946 ; AVX-NEXT: vmovdqa 448(%rdi), %xmm3
3947 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3948 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3949 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm10[0,1,0,1]
3950 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
3951 ; AVX-NEXT: vmovdqa 432(%rdi), %xmm2
3952 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3953 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm3
3954 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3955 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3956 ; AVX-NEXT: vmovdqa 400(%rdi), %xmm2
3957 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3958 ; AVX-NEXT: vmovdqa 384(%rdi), %xmm3
3959 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3960 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
3961 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm11[0],xmm13[0],xmm11[1],xmm13[1]
3962 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
3963 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3964 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
3965 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
3966 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3967 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
3968 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3969 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm1
3970 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3971 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
3972 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3973 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
3974 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm1
3975 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3976 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm2
3977 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3978 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3979 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3980 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
3981 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
3982 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm1
3983 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3984 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm2
3985 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3986 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3987 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3988 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm1
3989 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3990 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm2
3991 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3992 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
3993 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3994 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
3995 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
3996 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
3997 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
3998 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm0
3999 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4000 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm1
4001 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4002 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4003 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4004 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
4005 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm1
4006 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4007 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
4008 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4009 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4010 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4011 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
4012 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
4013 ; AVX-NEXT: vmovdqa (%rdi), %xmm15
4014 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4015 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
4016 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4017 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
4018 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4019 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm4
4020 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4021 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
4022 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm15[0],xmm1[0],xmm15[1],xmm1[1],xmm15[2],xmm1[2],xmm15[3],xmm1[3]
4023 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
4024 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4025 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7]
4026 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4027 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4028 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4029 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
4030 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5,6,7]
4031 ; AVX-NEXT: vmovdqa %xmm6, %xmm4
4032 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4033 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm6[0],xmm14[1],xmm6[1]
4034 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
4035 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
4036 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4037 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4038 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[1,1,1,1]
4039 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1],xmm13[2,3],xmm15[4,5,6,7]
4040 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
4041 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
4042 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4043 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4044 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4045 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4046 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
4047 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4048 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4049 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm2[1,1,1,1]
4050 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4051 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
4052 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4053 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
4054 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[1,1,1,1]
4055 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm12[2,3],xmm3[4,5,6,7]
4056 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4057 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4058 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
4059 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7]
4060 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4061 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4062 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
4063 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3,4,5],xmm0[6,7]
4064 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
4065 ; AVX-NEXT: vmovdqa %xmm5, %xmm4
4066 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
4067 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm13[2],xmm11[3],xmm13[3]
4068 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4069 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4070 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm11[2,2,2,2]
4071 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm10[0,1,2,3,4,5],xmm15[6,7]
4072 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
4073 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
4074 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4075 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4076 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4077 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4078 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm6[2,2,2,2]
4079 ; AVX-NEXT: vmovaps %xmm6, %xmm5
4080 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm7[0,1,2],xmm3[3]
4081 ; AVX-NEXT: vmovaps %xmm7, %xmm6
4082 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4083 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
4084 ; AVX-NEXT: vmovdqa %xmm9, %xmm7
4085 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm9[2,2,2,2]
4086 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4087 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1,2,3,4,5],xmm3[6,7]
4088 ; AVX-NEXT: vmovdqa %xmm12, %xmm9
4089 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4090 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm12[2],xmm9[2],xmm12[3],xmm9[3]
4091 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7]
4092 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4093 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4094 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm0 # 16-byte Folded Reload
4095 ; AVX-NEXT: # xmm0 = xmm14[2],mem[2],xmm14[3],mem[3]
4096 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[2,3,2,3]
4097 ; AVX-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4098 ; AVX-NEXT: # xmm14 = mem[3,3,3,3]
4099 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7]
4100 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
4101 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm11[2],xmm10[3],xmm11[3]
4102 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm13[2,3,2,3]
4103 ; AVX-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4104 ; AVX-NEXT: # xmm10 = mem[3,3,3,3]
4105 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1],xmm11[2,3],xmm10[4,5,6,7]
4106 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4107 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
4108 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
4109 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4110 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4111 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
4112 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm1[2,3,2,3]
4113 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm2[3,3,3,3]
4114 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
4115 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4116 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4117 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
4118 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm7[2],xmm8[3],xmm7[3]
4119 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[2,3,2,3]
4120 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,3,3,3]
4121 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
4122 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
4123 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4124 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4125 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4126 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
4127 ; AVX-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4128 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4129 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
4130 ; AVX-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4131 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4132 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
4133 ; AVX-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4134 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4135 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
4136 ; AVX-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4137 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[0,0,0,0]
4138 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
4139 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4140 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
4141 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm6[0],xmm10[1],xmm6[1]
4142 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4143 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4144 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
4145 ; AVX-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4146 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4147 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4148 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
4149 ; AVX-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4150 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4151 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4152 ; AVX-NEXT: vpunpckhwd (%rsp), %xmm1, %xmm2 # 16-byte Folded Reload
4153 ; AVX-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4154 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4155 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4156 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
4157 ; AVX-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4158 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4159 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
4160 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1]
4161 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
4162 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4163 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
4164 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4165 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
4166 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4167 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4168 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4169 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
4170 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4171 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4172 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4173 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
4174 ; AVX-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4175 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4176 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
4177 ; AVX-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4178 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4179 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
4180 ; AVX-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4181 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm12[0,0,0,0]
4182 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4183 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm9[0,1,0,1]
4184 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4185 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
4186 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm2[0],xmm11[1],xmm2[1]
4187 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4188 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4189 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4190 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4191 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
4192 ; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4193 ; AVX-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
4194 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4195 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
4196 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4197 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4198 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
4199 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
4200 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7]
4201 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4202 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm13 # 16-byte Folded Reload
4203 ; AVX-NEXT: # xmm13 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4204 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4205 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
4206 ; AVX-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4207 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
4208 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7]
4209 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4210 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4211 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
4212 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4213 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3],xmm0[4,5,6,7]
4214 ; AVX-NEXT: vmovdqa %xmm8, %xmm5
4215 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4216 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4217 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
4218 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4219 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4220 ; AVX-NEXT: vunpcklps {{.*#+}} xmm3 = xmm7[0],xmm8[0],xmm7[1],xmm8[1]
4221 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4222 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
4223 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm4[1,1,1,1]
4224 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4225 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm1[1],xmm15[2,3]
4226 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
4227 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
4228 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4229 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4230 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm12[0],xmm9[1],xmm12[1]
4231 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4232 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[1,1,1,1]
4233 ; AVX-NEXT: vmovdqa %xmm11, %xmm9
4234 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4235 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm11[2,3],xmm3[4,5,6,7]
4236 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4237 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
4238 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm14[1,1,1,1]
4239 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm13[2,3],xmm3[4,5,6,7]
4240 ; AVX-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload
4241 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4242 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm12[0],xmm2[0],xmm12[1],xmm2[1]
4243 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7]
4244 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4245 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4246 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
4247 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4248 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0,1,2,3,4,5],xmm0[6,7]
4249 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm10[2],xmm6[2],xmm10[3],xmm6[3]
4250 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
4251 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4252 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4253 ; AVX-NEXT: vmovaps %xmm8, %xmm6
4254 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm8[2,2,2,2]
4255 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm7[0,1,2],xmm15[3]
4256 ; AVX-NEXT: vmovaps %xmm7, %xmm8
4257 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
4258 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
4259 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4260 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4261 ; AVX-NEXT: vmovdqa %xmm9, %xmm4
4262 ; AVX-NEXT: vmovdqa %xmm11, %xmm1
4263 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
4264 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4265 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4266 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm9[2,2,2,2]
4267 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
4268 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm11[0,1,2],xmm3[3]
4269 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4270 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
4271 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2]
4272 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1,2,3,4,5],xmm3[6,7]
4273 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3]
4274 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7]
4275 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4276 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload
4277 ; AVX-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3]
4278 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4279 ; AVX-NEXT: # xmm3 = mem[2,3,2,3]
4280 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[3,3,3,3]
4281 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1],xmm3[2,3],xmm7[4,5,6,7]
4282 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
4283 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm6[2],xmm8[3],xmm6[3]
4284 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4285 ; AVX-NEXT: # xmm5 = mem[2,3,2,3]
4286 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4287 ; AVX-NEXT: # xmm6 = mem[3,3,3,3]
4288 ; AVX-NEXT: vblendps {{.*#+}} xmm5 = xmm6[0],xmm5[1],xmm6[2,3]
4289 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4290 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
4291 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
4292 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4293 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm11[2],xmm9[2],xmm11[3],xmm9[3]
4294 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,2,3]
4295 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[3,3,3,3]
4296 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7]
4297 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
4298 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
4299 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
4300 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm12[2],xmm2[2],xmm12[3],xmm2[3]
4301 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[2,3,2,3]
4302 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,3,3,3]
4303 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
4304 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm4[4,5,6,7]
4305 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
4306 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4307 ; AVX-NEXT: vmovaps %ymm2, (%rsi)
4308 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4309 ; AVX-NEXT: vmovaps %ymm2, 32(%rsi)
4310 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4311 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
4312 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4313 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
4314 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4315 ; AVX-NEXT: vmovaps %ymm2, (%rcx)
4316 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4317 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
4318 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4319 ; AVX-NEXT: vmovaps %ymm2, (%r8)
4320 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4321 ; AVX-NEXT: vmovaps %ymm2, 32(%r8)
4322 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4323 ; AVX-NEXT: vmovaps %ymm2, (%r9)
4324 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4325 ; AVX-NEXT: vmovaps %ymm2, 32(%r9)
4326 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4327 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4328 ; AVX-NEXT: vmovaps %ymm2, (%rax)
4329 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4330 ; AVX-NEXT: vmovaps %ymm2, 32(%rax)
4331 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4332 ; AVX-NEXT: vmovaps %ymm15, (%rax)
4333 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4334 ; AVX-NEXT: vmovaps %ymm2, 32(%rax)
4335 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
4336 ; AVX-NEXT: vmovaps %ymm1, (%rax)
4337 ; AVX-NEXT: vmovaps %ymm0, 32(%rax)
4338 ; AVX-NEXT: addq $872, %rsp # imm = 0x368
4339 ; AVX-NEXT: vzeroupper
4340 ; AVX-NEXT: retq
4341 ;
4342 ; AVX2-LABEL: load_i16_stride8_vf32:
4343 ; AVX2: # %bb.0:
4344 ; AVX2-NEXT: subq $1000, %rsp # imm = 0x3E8
4345 ; AVX2-NEXT: vmovdqa 448(%rdi), %ymm2
4346 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4347 ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm3
4348 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4349 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm0
4350 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4351 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm1
4352 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4353 ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm4
4354 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4355 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm5
4356 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4357 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4358 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4359 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4360 ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm0
4361 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4362 ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm1
4363 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4364 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4365 ; AVX2-NEXT: vpbroadcastd %xmm9, %xmm0
4366 ; AVX2-NEXT: vmovdqa 336(%rdi), %xmm1
4367 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4368 ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm4
4369 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4370 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4371 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4372 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
4373 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4374 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
4375 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4376 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
4377 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4378 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
4379 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4380 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4381 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4382 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4383 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
4384 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4385 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4386 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4387 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm2
4388 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4389 ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm3
4390 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4391 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
4392 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4393 ; AVX2-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,2,2,3,4,6,6,7]
4394 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4395 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4396 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4397 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
4398 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4399 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
4400 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
4401 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4402 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4403 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm0
4404 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4405 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm1
4406 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4407 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4408 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4409 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
4410 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm1
4411 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4412 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
4413 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4414 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4415 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4416 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
4417 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4418 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
4419 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4420 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
4421 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4422 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
4423 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4424 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm4
4425 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4426 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
4427 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4428 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm15[0],xmm10[1],xmm15[1]
4429 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
4430 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm0
4431 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4432 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm1
4433 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4434 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
4435 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4436 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
4437 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4438 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
4439 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4440 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
4441 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4442 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm11[7]
4443 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm2
4444 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4445 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm0
4446 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4447 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
4448 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4449 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
4450 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4451 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4452 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4453 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4454 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4455 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
4456 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4457 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
4458 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4459 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4460 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
4461 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4462 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
4463 ; AVX2-NEXT: vmovdqa %xmm9, %xmm14
4464 ; AVX2-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
4465 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4466 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
4467 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
4468 ; AVX2-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4469 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4470 ; AVX2-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
4471 ; AVX2-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4472 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
4473 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4474 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4475 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
4476 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
4477 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4478 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4479 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
4480 ; AVX2-NEXT: vmovdqa %xmm10, %xmm11
4481 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3]
4482 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4483 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4484 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
4485 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
4486 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4487 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4488 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
4489 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4490 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4491 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
4492 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
4493 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4494 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4495 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
4496 ; AVX2-NEXT: vmovdqa %xmm9, %xmm3
4497 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
4498 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
4499 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
4500 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
4501 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
4502 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
4503 ; AVX2-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
4504 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4505 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4506 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4507 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4508 ; AVX2-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7]
4509 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
4510 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
4511 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4512 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4513 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
4514 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
4515 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4516 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4517 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,2,2]
4518 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3]
4519 ; AVX2-NEXT: vmovdqa %xmm11, %xmm12
4520 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
4521 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4522 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
4523 ; AVX2-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
4524 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
4525 ; AVX2-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
4526 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4527 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4528 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
4529 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
4530 ; AVX2-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
4531 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
4532 ; AVX2-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
4533 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4534 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4535 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
4536 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
4537 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4538 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4539 ; AVX2-NEXT: vpunpckhdq (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
4540 ; AVX2-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3]
4541 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4542 ; AVX2-NEXT: # xmm5 = mem[2,3,2,3]
4543 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
4544 ; AVX2-NEXT: # xmm13 = mem[3,3,3,3]
4545 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2,3]
4546 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4547 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
4548 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
4549 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
4550 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
4551 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
4552 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
4553 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4554 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
4555 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4556 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload
4557 ; AVX2-NEXT: # xmm0 = xmm10[2],mem[2],xmm10[3],mem[3]
4558 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3]
4559 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3]
4560 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
4561 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4562 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
4563 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
4564 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4565 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
4566 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
4567 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
4568 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
4569 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4570 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4571 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4572 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
4573 ; AVX2-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4574 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4575 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4576 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
4577 ; AVX2-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4578 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4579 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4580 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
4581 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4582 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4583 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4584 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
4585 ; AVX2-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4586 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4587 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
4588 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm1
4589 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4590 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4591 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4592 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4593 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
4594 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4595 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
4596 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
4597 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4598 ; AVX2-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
4599 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4600 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
4601 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4602 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4603 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
4604 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
4605 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4606 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
4607 ; AVX2-NEXT: # ymm3 = mem[0,1,1,3]
4608 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4609 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
4610 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4611 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
4612 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4613 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
4614 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
4615 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4616 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4617 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4618 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
4619 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4620 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4621 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
4622 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
4623 ; AVX2-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
4624 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
4625 ; AVX2-NEXT: vpbroadcastd %xmm11, %xmm1
4626 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4627 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4628 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
4629 ; AVX2-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4630 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4631 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
4632 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
4633 ; AVX2-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
4634 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
4635 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
4636 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
4637 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3]
4638 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4639 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4640 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
4641 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4642 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
4643 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4644 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4]
4645 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4646 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
4647 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
4648 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3]
4649 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4650 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4651 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
4652 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4653 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7]
4654 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4655 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
4656 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4657 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
4658 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
4659 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
4660 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4661 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4662 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
4663 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4664 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
4665 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4666 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4667 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
4668 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
4669 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4670 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4671 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7]
4672 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4673 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4674 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
4675 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
4676 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
4677 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4678 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
4679 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
4680 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
4681 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4682 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4683 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
4684 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4685 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4686 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
4687 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4688 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4689 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
4690 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
4691 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4692 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4693 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
4694 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
4695 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
4696 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4697 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
4698 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
4699 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
4700 ; AVX2-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
4701 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4702 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4703 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
4704 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
4705 ; AVX2-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
4706 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
4707 ; AVX2-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
4708 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4709 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4710 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
4711 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
4712 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4713 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4714 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
4715 ; AVX2-NEXT: vmovdqa %xmm11, %xmm7
4716 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
4717 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
4718 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3]
4719 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
4720 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
4721 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4722 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
4723 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4724 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4725 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
4726 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
4727 ; AVX2-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
4728 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
4729 ; AVX2-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
4730 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4731 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4732 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
4733 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
4734 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
4735 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4736 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
4737 ; AVX2-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
4738 ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,3,2,3]
4739 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4740 ; AVX2-NEXT: # xmm15 = mem[3,3,3,3]
4741 ; AVX2-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3]
4742 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3]
4743 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
4744 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
4745 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
4746 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
4747 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
4748 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
4749 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4750 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
4751 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
4752 ; AVX2-NEXT: # xmm3 = xmm7[2],mem[2],xmm7[3],mem[3]
4753 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3]
4754 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,3,3,3]
4755 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
4756 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
4757 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
4758 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
4759 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
4760 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
4761 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
4762 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
4763 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4764 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
4765 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4766 ; AVX2-NEXT: vmovaps %ymm1, (%rsi)
4767 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4768 ; AVX2-NEXT: vmovaps %ymm1, 32(%rsi)
4769 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4770 ; AVX2-NEXT: vmovaps %ymm1, (%rdx)
4771 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4772 ; AVX2-NEXT: vmovaps %ymm1, 32(%rdx)
4773 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4774 ; AVX2-NEXT: vmovaps %ymm1, (%rcx)
4775 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4776 ; AVX2-NEXT: vmovaps %ymm1, 32(%rcx)
4777 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4778 ; AVX2-NEXT: vmovaps %ymm1, (%r8)
4779 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4780 ; AVX2-NEXT: vmovaps %ymm1, 32(%r8)
4781 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4782 ; AVX2-NEXT: vmovaps %ymm1, (%r9)
4783 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4784 ; AVX2-NEXT: vmovaps %ymm1, 32(%r9)
4785 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
4786 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4787 ; AVX2-NEXT: vmovaps %ymm1, (%rax)
4788 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4789 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
4790 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
4791 ; AVX2-NEXT: vmovdqa %ymm8, (%rax)
4792 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
4793 ; AVX2-NEXT: vmovaps %ymm1, 32(%rax)
4794 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
4795 ; AVX2-NEXT: vmovdqa %ymm0, (%rax)
4796 ; AVX2-NEXT: vmovdqa %ymm2, 32(%rax)
4797 ; AVX2-NEXT: addq $1000, %rsp # imm = 0x3E8
4798 ; AVX2-NEXT: vzeroupper
4799 ; AVX2-NEXT: retq
4800 ;
4801 ; AVX2-FP-LABEL: load_i16_stride8_vf32:
4802 ; AVX2-FP: # %bb.0:
4803 ; AVX2-FP-NEXT: subq $1000, %rsp # imm = 0x3E8
4804 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm2
4805 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4806 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm3
4807 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4808 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm0
4809 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4810 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm1
4811 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4812 ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm4
4813 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4814 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm5
4815 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4816 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
4817 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4818 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4819 ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm0
4820 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4821 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm1
4822 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4823 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4824 ; AVX2-FP-NEXT: vpbroadcastd %xmm9, %xmm0
4825 ; AVX2-FP-NEXT: vmovdqa 336(%rdi), %xmm1
4826 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4827 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm4
4828 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4829 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
4830 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4831 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm1
4832 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4833 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
4834 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
4835 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
4836 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4837 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
4838 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4839 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4840 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4841 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4842 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
4843 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4844 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4845 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4846 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm2
4847 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4848 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm3
4849 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4850 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
4851 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4852 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,2,2,3,4,6,6,7]
4853 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4854 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4855 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4856 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
4857 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4858 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
4859 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
4860 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4861 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4862 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm0
4863 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4864 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm1
4865 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4866 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
4867 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4868 ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %xmm0
4869 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm1
4870 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4871 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2
4872 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4873 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
4874 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4875 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm1
4876 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4877 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
4878 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4879 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2
4880 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4881 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3
4882 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4883 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm4
4884 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4885 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
4886 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
4887 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm15[0],xmm10[1],xmm15[1]
4888 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
4889 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm0
4890 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4891 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm1
4892 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4893 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
4894 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4895 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
4896 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
4897 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
4898 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4899 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
4900 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
4901 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm11[7]
4902 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm2
4903 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4904 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm0
4905 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4906 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
4907 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4908 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
4909 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
4910 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
4911 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4912 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
4913 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
4914 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
4915 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
4916 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
4917 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4918 ; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4919 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
4920 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4921 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
4922 ; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm14
4923 ; AVX2-FP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
4924 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
4925 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
4926 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
4927 ; AVX2-FP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4928 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4929 ; AVX2-FP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
4930 ; AVX2-FP-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4931 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
4932 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4933 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4934 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
4935 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
4936 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4937 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4938 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
4939 ; AVX2-FP-NEXT: vmovdqa %xmm10, %xmm11
4940 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3]
4941 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4942 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
4943 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
4944 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
4945 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
4946 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
4947 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
4948 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
4949 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
4950 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
4951 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
4952 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4953 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4954 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
4955 ; AVX2-FP-NEXT: vmovdqa %xmm9, %xmm3
4956 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
4957 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
4958 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
4959 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
4960 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
4961 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
4962 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
4963 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4964 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4965 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm1[7]
4966 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
4967 ; AVX2-FP-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7]
4968 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
4969 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
4970 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4971 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4972 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
4973 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
4974 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4975 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4976 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,2,2]
4977 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3]
4978 ; AVX2-FP-NEXT: vmovdqa %xmm11, %xmm12
4979 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
4980 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
4981 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
4982 ; AVX2-FP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
4983 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
4984 ; AVX2-FP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
4985 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
4986 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
4987 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
4988 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
4989 ; AVX2-FP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
4990 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
4991 ; AVX2-FP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
4992 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
4993 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
4994 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
4995 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
4996 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
4997 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4998 ; AVX2-FP-NEXT: vpunpckhdq (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
4999 ; AVX2-FP-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3]
5000 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5001 ; AVX2-FP-NEXT: # xmm5 = mem[2,3,2,3]
5002 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5003 ; AVX2-FP-NEXT: # xmm13 = mem[3,3,3,3]
5004 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2,3]
5005 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
5006 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5007 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5008 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
5009 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5010 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5011 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
5012 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5013 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
5014 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5015 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload
5016 ; AVX2-FP-NEXT: # xmm0 = xmm10[2],mem[2],xmm10[3],mem[3]
5017 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3]
5018 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3]
5019 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
5020 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5021 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
5022 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
5023 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5024 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
5025 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
5026 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
5027 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5028 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5029 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5030 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5031 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
5032 ; AVX2-FP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5033 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5034 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5035 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
5036 ; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5037 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5038 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5039 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
5040 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5041 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5042 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5043 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
5044 ; AVX2-FP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5045 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5046 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
5047 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm1
5048 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5049 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5050 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5051 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5052 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
5053 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5054 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
5055 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
5056 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5057 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
5058 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5059 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
5060 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5061 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
5062 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
5063 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
5064 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5065 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
5066 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3]
5067 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5068 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
5069 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5070 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
5071 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5072 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
5073 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
5074 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5075 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5076 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5077 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
5078 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5079 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5080 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
5081 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
5082 ; AVX2-FP-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
5083 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
5084 ; AVX2-FP-NEXT: vpbroadcastd %xmm11, %xmm1
5085 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5086 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5087 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
5088 ; AVX2-FP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
5089 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5090 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
5091 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
5092 ; AVX2-FP-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
5093 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
5094 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
5095 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
5096 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3]
5097 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5098 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5099 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
5100 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5101 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
5102 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
5103 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4]
5104 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
5105 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
5106 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
5107 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3]
5108 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5109 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5110 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
5111 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5112 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7]
5113 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
5114 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
5115 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
5116 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
5117 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
5118 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
5119 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5120 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
5121 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
5122 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
5123 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
5124 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
5125 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
5126 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
5127 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
5128 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
5129 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
5130 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7]
5131 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
5132 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
5133 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
5134 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
5135 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
5136 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5137 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
5138 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
5139 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
5140 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
5141 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
5142 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
5143 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
5144 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
5145 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
5146 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
5147 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
5148 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
5149 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
5150 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5151 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5152 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
5153 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
5154 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
5155 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
5156 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
5157 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
5158 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
5159 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
5160 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
5161 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
5162 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
5163 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
5164 ; AVX2-FP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
5165 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
5166 ; AVX2-FP-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
5167 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
5168 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
5169 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
5170 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
5171 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5172 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5173 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
5174 ; AVX2-FP-NEXT: vmovdqa %xmm11, %xmm7
5175 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
5176 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
5177 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3]
5178 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
5179 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
5180 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
5181 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
; AVX2-FP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,3,2,3]
; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm15 = mem[3,3,3,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3]
; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
; AVX2-FP-NEXT: # xmm3 = xmm7[2],mem[2],xmm7[3],mem[3]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3]
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,3,3,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r8)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%r9)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%r9)
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, (%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovdqa %ymm8, (%rax)
; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FP-NEXT: vmovaps %ymm1, 32(%rax)
; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%rax)
; AVX2-FP-NEXT: addq $1000, %rsp # imm = 0x3E8
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq

; AVX2-FCP-LABEL: load_i16_stride8_vf32:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $1000, %rsp # imm = 0x3E8
; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm4
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm5
; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vpbroadcastd %xmm9, %xmm0
; AVX2-FCP-NEXT: vmovdqa 336(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm4
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm13[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm15[0],xmm10[1],xmm15[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm11[7]
; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3,4],ymm0[5],ymm14[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm5[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm14
; AVX2-FCP-NEXT: vmovdqa %xmm9, (%rsp) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm6 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
; AVX2-FCP-NEXT: vmovdqa %xmm10, %xmm11
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm15[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
; AVX2-FCP-NEXT: vmovdqa %xmm9, %xmm3
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[2,2,2,2]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm10[0,1,2],xmm4[3]
; AVX2-FCP-NEXT: vmovdqa %xmm11, %xmm12
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm11[2],xmm15[2],xmm11[3],xmm15[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq (%rsp), %xmm3, %xmm4 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm4 = xmm3[2],mem[2],xmm3[3],mem[3]
; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm5 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = mem[3,3,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm13[0],xmm5[1],xmm13[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm10[2],mem[2],xmm10[3],mem[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[2,3,2,3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,3,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
; AVX2-FCP-NEXT: vpbroadcastd %xmm11, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm14 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm14 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm2[0],xmm14[1],xmm2[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm0[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm15[7]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm6[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5,6],ymm5[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm6[2],xmm15[3],xmm6[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm5[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
; AVX2-FCP-NEXT: vmovdqa %xmm11, %xmm7
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm12[2],xmm14[3],xmm12[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4,5,6],ymm9[7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4,5],ymm9[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm9 = xmm9[2],mem[2],xmm9[3],mem[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm6[2,3,2,3]
; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm15 = mem[3,3,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm15[0],xmm13[1],xmm15[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm13[0,1],xmm9[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3],ymm2[4,5,6,7]
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm3 = xmm7[2],mem[2],xmm7[3],mem[3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm12[2,3,2,3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm14[3,3,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4],ymm1[5],ymm4[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rsi)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rdx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rcx)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r8)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%r9)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%r9)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, (%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %ymm8, (%rax)
; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
; AVX2-FCP-NEXT: vmovaps %ymm1, 32(%rax)
; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rax)
; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%rax)
; AVX2-FCP-NEXT: addq $1000, %rsp # imm = 0x3E8
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq

; AVX512-LABEL: load_i16_stride8_vf32:
; AVX512: # %bb.0:
; AVX512-NEXT: subq $616, %rsp # imm = 0x268
; AVX512-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 352(%rdi), %xmm1
; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa 336(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1
; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
; AVX512-NEXT: vmovdqa64 %xmm0, %xmm26
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
; AVX512-NEXT: vmovdqa %xmm5, %xmm0
; AVX512-NEXT: vpermt2d %xmm3, %xmm1, %xmm0
; AVX512-NEXT: vmovdqa 304(%rdi), %xmm2
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 288(%rdi), %xmm3
; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512-NEXT: vmovdqa 272(%rdi), %xmm2
; AVX512-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
; AVX512-NEXT: vmovdqa 256(%rdi), %xmm3
; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16
; AVX512-NEXT: vmovdqa 480(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21
; AVX512-NEXT: vmovdqa 448(%rdi), %ymm2
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2]
; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
; AVX512-NEXT: vmovdqa 416(%rdi), %ymm2
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2]
; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2
; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2]
; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
; AVX512-NEXT: movb $-64, %al
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1}
; AVX512-NEXT: vmovdqa 112(%rdi), %xmm2
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 96(%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3
; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3]
; AVX512-NEXT: vmovdqa64 %xmm3, %xmm23
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; AVX512-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vmovdqa 48(%rdi), %xmm4
; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3]
; AVX512-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2]
; AVX512-NEXT: vmovdqa 192(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7]
; AVX512-NEXT: vmovdqa 160(%rdi), %ymm0
; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX512-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2]
; AVX512-NEXT: vmovdqa64 128(%rdi), %ymm28
; AVX512-NEXT: vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2]
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
; AVX512-NEXT: vmovdqa64 %ymm21, %ymm4
; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1}
; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
; AVX512-NEXT: vmovdqa64 %xmm19, %xmm4
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
; AVX512-NEXT: vmovdqa64 %xmm26, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; AVX512-NEXT: vmovdqa64 %xmm23, %xmm15
; AVX512-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0]
; AVX512-NEXT: vpermt2d %xmm11, %xmm6, %xmm7
; AVX512-NEXT: vmovdqa64 %xmm25, %xmm0
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
; AVX512-NEXT: vpermt2d %xmm8, %xmm6, %xmm3
; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX512-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
; AVX512-NEXT: vmovdqa64 %xmm2, %xmm30
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX512-NEXT: vmovdqa %xmm1, %xmm2
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
; AVX512-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
; AVX512-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX512-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; AVX512-NEXT: vmovdqa64 %xmm5, %xmm20
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
; AVX512-NEXT: # ymm19 = mem[0,1,1,3]
; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
; AVX512-NEXT: # ymm21 = mem[0,1,1,3]
; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
; AVX512-NEXT: # ymm29 = mem[0,1,1,3]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
; AVX512-NEXT: # ymm23 = mem[0,1,1,3]
; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1}
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX512-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX512-NEXT: vmovdqa %xmm1, %xmm0
; AVX512-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
; AVX512-NEXT: vmovdqa64 %xmm3, %xmm16
; AVX512-NEXT: vmovdqa64 %xmm2, %xmm18
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
; AVX512-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
; AVX512-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
; AVX512-NEXT: # ymm17 = mem[0,1,1,3]
; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
; AVX512-NEXT: # ymm24 = mem[0,1,1,3]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7]
; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
; AVX512-NEXT: # ymm25 = mem[0,1,1,3]
; AVX512-NEXT: vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3]
; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm27
; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
; AVX512-NEXT: vmovdqa64 %xmm20, %xmm12
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
; AVX512-NEXT: vmovdqa64 %xmm16, %xmm9
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1]
; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3]
; AVX512-NEXT: vmovdqa64 %xmm12, %xmm16
; AVX512-NEXT: vmovdqa64 %xmm30, %xmm1
; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1
; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
; AVX512-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2]
; AVX512-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3]
; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
; AVX512-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
; AVX512-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0]
; AVX512-NEXT: vpermt2d %xmm16, %xmm12, %xmm4
; AVX512-NEXT: vmovdqa64 %xmm31, %xmm11
; AVX512-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
; AVX512-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6073 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6074 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
6075 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6076 ; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1}
6077 ; AVX512-NEXT: vpermt2d %xmm8, %xmm12, %xmm5
6078 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
6079 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
6080 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6081 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6082 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
6083 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6084 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6085 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
6086 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6087 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6088 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
6089 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6090 ; AVX512-NEXT: vmovaps %zmm2, (%rsi)
6091 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6092 ; AVX512-NEXT: vmovaps %zmm2, (%rdx)
6093 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6094 ; AVX512-NEXT: vmovaps %zmm2, (%rcx)
6095 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6096 ; AVX512-NEXT: vmovaps %zmm2, (%r8)
6097 ; AVX512-NEXT: vmovdqa64 %zmm27, (%r9)
6098 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
6099 ; AVX512-NEXT: vmovdqa64 %zmm20, (%rax)
6100 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
6101 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
6102 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
6103 ; AVX512-NEXT: vmovdqa64 %zmm0, (%rax)
6104 ; AVX512-NEXT: addq $616, %rsp # imm = 0x268
6105 ; AVX512-NEXT: vzeroupper
6106 ; AVX512-NEXT: retq
6108 ; AVX512-FCP-LABEL: load_i16_stride8_vf32:
6109 ; AVX512-FCP: # %bb.0:
6110 ; AVX512-FCP-NEXT: subq $552, %rsp # imm = 0x228
6111 ; AVX512-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
6112 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6113 ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
6114 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6115 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6116 ; AVX512-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
6117 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6118 ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
6119 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6120 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6121 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
6122 ; AVX512-FCP-NEXT: vmovdqa %xmm3, %xmm0
6123 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
6124 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0
6125 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
6126 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm2
6127 ; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
6128 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6129 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm3
6130 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6131 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6132 ; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
6133 ; AVX512-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
6134 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
6135 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6136 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6137 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
6138 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
6139 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
6140 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm11
6141 ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm0
6142 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6143 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2]
6144 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4]
6145 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6146 ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm1
6147 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6148 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2]
6149 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,1,2,0,4,5,6,4]
6150 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6151 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
6152 ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm1
6153 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6154 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm28 = ymm1[0,1,0,2]
6155 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm1
6156 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6157 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
6158 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7]
6159 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6160 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7]
6161 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6162 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
6163 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6164 ; AVX512-FCP-NEXT: movb $-64, %al
6165 ; AVX512-FCP-NEXT: kmovw %eax, %k1
6166 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
6167 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
6168 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6169 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
6170 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6171 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6172 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
6173 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6174 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
6175 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6176 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6177 ; AVX512-FCP-NEXT: vmovdqa %xmm13, %xmm0
6178 ; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm2, %xmm0
6179 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25
6180 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
6181 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6182 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
6183 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6184 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
6185 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6186 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
6187 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6188 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6189 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
6190 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
6191 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
6192 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
6193 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
6194 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6195 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
6196 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
6197 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2]
6198 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4]
6199 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6200 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4]
6201 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6202 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7]
6203 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
6204 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6205 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2]
6206 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
6207 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6208 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
6209 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
6210 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6211 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
6212 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6213 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
6214 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6215 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6216 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
6217 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6218 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0]
6219 ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0
6220 ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm11, %xmm0
6221 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1]
6222 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6223 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6224 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6225 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
6226 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6227 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6228 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7]
6229 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6230 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
6231 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
6232 ; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm1
6233 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm11, %xmm1
6234 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1]
6235 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
6236 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6237 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6238 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
6239 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6240 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6241 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
6242 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
6243 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6244 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
6245 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6246 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
6247 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
6248 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
6249 ; AVX512-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2
6250 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
6251 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
6252 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm3
6253 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7]
6254 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6255 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7]
6256 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6257 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm5[7]
6258 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7]
6259 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6260 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7]
6261 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6262 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
6263 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
6264 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1}
6265 ; AVX512-FCP-NEXT: vmovdqa %xmm13, %xmm7
6266 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3]
6267 ; AVX512-FCP-NEXT: vpermt2d %xmm25, %xmm0, %xmm7
6268 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
6269 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
6270 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7]
6271 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6272 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
6273 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6274 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
6275 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
6276 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6277 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
6278 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6279 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
6280 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
6281 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
6282 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
6283 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6284 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0]
6285 ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9
6286 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm0
6287 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
6288 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6289 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6290 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6291 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6292 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6293 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6294 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
6295 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6296 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
6297 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm27, %xmm6
6298 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3]
6299 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6300 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6301 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
6302 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6303 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6304 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
6305 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
6306 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6307 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28
6308 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6309 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
6310 ; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6311 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6312 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
6313 ; AVX512-FCP-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6314 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6315 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
6316 ; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6317 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6318 ; AVX512-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload
6319 ; AVX512-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6320 ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm1
6321 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
6322 ; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm0, %xmm1
6323 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
6324 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
6325 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm18
6326 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
6327 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31
6328 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
6329 ; AVX512-FCP-NEXT: # ymm29 = mem[0,1,1,3]
6330 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
6331 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
6332 ; AVX512-FCP-NEXT: # ymm26 = mem[0,1,1,3]
6333 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6334 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4]
6335 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6336 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
6337 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
6338 ; AVX512-FCP-NEXT: # ymm30 = mem[0,1,1,3]
6339 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
6340 ; AVX512-FCP-NEXT: # ymm25 = mem[0,1,1,3]
6341 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7]
6342 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6343 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
6344 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6345 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
6346 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
6347 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1}
6348 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6349 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
6350 ; AVX512-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6351 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6352 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
6353 ; AVX512-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6354 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6355 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
6356 ; AVX512-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6357 ; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
6358 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
6359 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
6360 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6361 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
6362 ; AVX512-FCP-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6363 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
6364 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm16
6365 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3]
6366 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
6367 ; AVX512-FCP-NEXT: # ymm20 = mem[0,1,1,3]
6368 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4]
6369 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3]
6370 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6371 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4]
6372 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6373 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
6374 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
6375 ; AVX512-FCP-NEXT: # ymm19 = mem[0,1,1,3]
6376 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
6377 ; AVX512-FCP-NEXT: # ymm21 = mem[0,1,1,3]
6378 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7]
6379 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6380 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7]
6381 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6382 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
6383 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6384 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
6385 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31
6386 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm0
6387 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0]
6388 ; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm4, %xmm0
6389 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
6390 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6391 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6392 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6393 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
6394 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6395 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6396 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
6397 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6398 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6399 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
6400 ; AVX512-FCP-NEXT: vmovdqa %xmm4, %xmm3
6401 ; AVX512-FCP-NEXT: vpermi2d %xmm16, %xmm7, %xmm3
6402 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1]
6403 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3]
6404 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6405 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6406 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
6407 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6408 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6409 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
6410 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7]
6411 ; AVX512-FCP-NEXT: vmovdqa %xmm12, %xmm8
6412 ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
6413 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
6414 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7]
6415 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24
6416 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
6417 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm0, %xmm8
6418 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3]
6419 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
6420 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
6421 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7]
6422 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6423 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7]
6424 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6425 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
6426 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7]
6427 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6428 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7]
6429 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6430 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7]
6431 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
6432 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
6433 ; AVX512-FCP-NEXT: vpermi2d %xmm22, %xmm17, %xmm0
6434 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
6435 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
6436 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7]
6437 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6438 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7]
6439 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6440 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
6441 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7]
6442 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6443 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7]
6444 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6445 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7]
6446 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7]
6447 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
6448 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
6449 ; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm27, %xmm5
6450 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3]
6451 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
6452 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6453 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6454 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
6455 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6456 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6457 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
6458 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
6459 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
6460 ; AVX512-FCP-NEXT: vpermt2d %xmm16, %xmm27, %xmm7
6461 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3]
6462 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
6463 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6464 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6465 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
6466 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6467 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6468 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
6469 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
6470 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6471 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
6472 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6473 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi)
6474 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6475 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx)
6476 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6477 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rcx)
6478 ; AVX512-FCP-NEXT: vmovdqa64 %zmm28, (%r8)
6479 ; AVX512-FCP-NEXT: vmovdqa64 %zmm31, (%r9)
6480 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6481 ; AVX512-FCP-NEXT: vmovdqa64 %zmm24, (%rax)
6482 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6483 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
6484 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
6485 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
6486 ; AVX512-FCP-NEXT: addq $552, %rsp # imm = 0x228
6487 ; AVX512-FCP-NEXT: vzeroupper
6488 ; AVX512-FCP-NEXT: retq
6490 ; AVX512DQ-LABEL: load_i16_stride8_vf32:
6491 ; AVX512DQ: # %bb.0:
6492 ; AVX512DQ-NEXT: subq $616, %rsp # imm = 0x268
6493 ; AVX512DQ-NEXT: vmovdqa 368(%rdi), %xmm0
6494 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6495 ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm1
6496 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6497 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6498 ; AVX512DQ-NEXT: vmovdqa 336(%rdi), %xmm0
6499 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6500 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1
6501 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6502 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6503 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
6504 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm27 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
6505 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
6506 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
6507 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm26
6508 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm25 = xmm5[2],xmm3[2],xmm5[3],xmm3[3]
6509 ; AVX512DQ-NEXT: vmovdqa %xmm5, %xmm0
6510 ; AVX512DQ-NEXT: vpermt2d %xmm3, %xmm1, %xmm0
6511 ; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm2
6512 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6513 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm3
6514 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6515 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6516 ; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm2
6517 ; AVX512DQ-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
6518 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm3
6519 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6520 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
6521 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
6522 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
6523 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm16
6524 ; AVX512DQ-NEXT: vmovdqa 480(%rdi), %ymm0
6525 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6526 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
6527 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,1,2,0,4,5,6,4]
6528 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6529 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21
6530 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm2
6531 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6532 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm20 = ymm2[0,1,0,2]
6533 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm20[0,1,2,0,4,5,6,4]
6534 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6535 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,6],ymm0[7]
6536 ; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm2
6537 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6538 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm2[0,1,0,2]
6539 ; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm2
6540 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6541 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm2[0,1,0,2]
6542 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm22[0,2,2,3,4,6,6,7]
6543 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6544 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,2,2,3,4,6,6,7]
6545 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm9[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6546 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2,3,4],ymm2[5],ymm8[6,7]
6547 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
6548 ; AVX512DQ-NEXT: movb $-64, %al
6549 ; AVX512DQ-NEXT: kmovw %eax, %k1
6550 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm16 {%k1}
6551 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm2
6552 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6553 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm0
6554 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6555 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
6556 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3
6557 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6558 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2
6559 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6560 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
6561 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,2,2,2]
6562 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm8[3]
6563 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm23
6564 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
6565 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm24 = xmm2[2],xmm0[2],xmm2[3],xmm0[3]
6566 ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
6567 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
6568 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6569 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm1
6570 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6571 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm3
6572 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6573 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm4
6574 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6575 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6576 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
6577 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm3[0],xmm8[0],xmm3[1],xmm8[1]
6578 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm15[0,1],xmm2[2,3]
6579 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm0
6580 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6581 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm31 = ymm0[0,1,0,2]
6582 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm0
6583 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6584 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
6585 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm31[0,1,2,0,4,5,6,4]
6586 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6587 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm30[0,1,2,0,4,5,6,4]
6588 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6589 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm14[0,1,2,3,4,5,6],ymm0[7]
6590 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm0
6591 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6592 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm14 = ymm0[0,1,0,2]
6593 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %ymm28
6594 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm17 = ymm28[0,1,0,2]
6595 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[0,2,2,3,4,6,6,7]
6596 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6597 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[0,2,2,3,4,6,6,7]
6598 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6599 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
6600 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
6601 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
6602 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm16, %zmm0
6603 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6604 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
6605 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm11[1],xmm0[2,3]
6606 ; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4
6607 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
6608 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm4
6609 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6610 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm12[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6611 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
6612 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6613 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6614 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4],ymm6[5],ymm9[6,7]
6615 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
6616 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6617 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm0 {%k1}
6618 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm15[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6619 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6620 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
6621 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6622 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6623 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
6624 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
6625 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
6626 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0],xmm8[1],xmm2[2,3]
6627 ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm4
6628 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
6629 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6630 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
6631 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6632 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm7[2],xmm11[2],xmm7[3],xmm11[3]
6633 ; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm1
6634 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6635 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6636 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[0,1,1,3,4,5,5,7]
6637 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6638 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm20[0,1,1,3,4,5,5,7]
6639 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6640 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5,6],ymm2[7]
6641 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm22[3,1,2,3,7,5,6,7]
6642 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6643 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[3,1,2,3,7,5,6,7]
6644 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm10 = ymm9[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6645 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm10[0,1,2,3,4],ymm6[5],ymm10[6,7]
6646 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
6647 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
6648 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm31[0,1,1,3,4,5,5,7]
6649 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6650 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm30[0,1,1,3,4,5,5,7]
6651 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm12 = ymm10[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6652 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3,4,5,6],ymm6[7]
6653 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[3,1,2,3,7,5,6,7]
6654 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6655 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[3,1,2,3,7,5,6,7]
6656 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6657 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4],ymm13[5],ymm15[6,7]
6658 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5],ymm6[6,7]
6659 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm13 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
6660 ; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm15
6661 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm13 = xmm13[0,1],xmm15[2,3]
6662 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7]
6663 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0
6664 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6665 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [3,7,0,0]
6666 ; AVX512DQ-NEXT: vpermt2d %xmm11, %xmm6, %xmm7
6667 ; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm0
6668 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm7[0,1],xmm0[2,3]
6669 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6670 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6671 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6672 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7]
6673 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6674 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm9[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6675 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
6676 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
6677 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
6678 ; AVX512DQ-NEXT: vpermt2d %xmm8, %xmm6, %xmm3
6679 ; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1
6680 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
6681 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6682 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6683 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6684 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6685 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6686 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
6687 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6688 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
6689 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
6690 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6691 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6692 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
6693 ; AVX512DQ-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6694 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6695 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
6696 ; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6697 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
6698 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
6699 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm30
6700 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm22 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
6701 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm31 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6702 ; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm2
6703 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
6704 ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm1, %xmm2
6705 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6706 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
6707 ; AVX512DQ-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6708 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6709 ; AVX512DQ-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm4 # 16-byte Folded Reload
6710 ; AVX512DQ-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6711 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
6712 ; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm20
6713 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
6714 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm12
6715 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
6716 ; AVX512DQ-NEXT: # ymm19 = mem[0,1,1,3]
6717 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm19[0,1,2,0,4,5,6,4]
6718 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6719 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
6720 ; AVX512DQ-NEXT: # ymm21 = mem[0,1,1,3]
6721 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm21[0,1,2,0,4,5,6,4]
6722 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6723 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6724 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
6725 ; AVX512DQ-NEXT: # ymm29 = mem[0,1,1,3]
6726 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm23 # 32-byte Folded Reload
6727 ; AVX512DQ-NEXT: # ymm23 = mem[0,1,1,3]
6728 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[0,2,2,3,4,6,6,7]
6729 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6730 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,2,2,3,4,6,6,7]
6731 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6732 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
6733 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6734 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm12 {%k1}
6735 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6736 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
6737 ; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6738 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
6739 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
6740 ; AVX512DQ-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
6741 ; AVX512DQ-NEXT: vmovdqa %xmm1, %xmm0
6742 ; AVX512DQ-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
6743 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm16
6744 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm18
6745 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6746 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm8 # 16-byte Folded Reload
6747 ; AVX512DQ-NEXT: # xmm8 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6748 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
6749 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
6750 ; AVX512DQ-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
6751 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm5[0],xmm8[0],xmm5[1],xmm8[1]
6752 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm15 = xmm15[0,1],xmm0[2,3]
6753 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm17 # 32-byte Folded Reload
6754 ; AVX512DQ-NEXT: # ymm17 = mem[0,1,1,3]
6755 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm17[0,1,2,0,4,5,6,4]
6756 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6757 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm24 # 32-byte Folded Reload
6758 ; AVX512DQ-NEXT: # ymm24 = mem[0,1,1,3]
6759 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,1,2,0,4,5,6,4]
6760 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6761 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm0[7]
6762 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
6763 ; AVX512DQ-NEXT: # ymm25 = mem[0,1,1,3]
6764 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm26 = ymm28[0,1,1,3]
6765 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm25[0,2,2,3,4,6,6,7]
6766 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6767 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm26[0,2,2,3,4,6,6,7]
6768 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6769 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4],ymm0[5],ymm6[6,7]
6770 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
6771 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3],ymm0[4,5,6,7]
6772 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm27
6773 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
6774 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm12
6775 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3]
6776 ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm2
6777 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
6778 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6779 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm11[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6780 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5,6],ymm2[7]
6781 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6782 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6783 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5],ymm7[6,7]
6784 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
6785 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
6786 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
6787 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6788 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6789 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6790 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6791 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6792 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
6793 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6794 ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm9
6795 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm16[0],xmm18[0],xmm16[1],xmm18[1]
6796 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[1,1,1,1]
6797 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
6798 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
6799 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6800 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm20
6801 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm4[2],xmm12[2],xmm4[3],xmm12[3]
6802 ; AVX512DQ-NEXT: vmovdqa64 %xmm12, %xmm16
6803 ; AVX512DQ-NEXT: vmovdqa64 %xmm30, %xmm1
6804 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6805 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm1
6806 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm19[0,1,1,3,4,5,5,7]
6807 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6808 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm21[0,1,1,3,4,5,5,7]
6809 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6810 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6],ymm0[7]
6811 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm29[3,1,2,3,7,5,6,7]
6812 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6813 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm23[3,1,2,3,7,5,6,7]
6814 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6815 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4],ymm7[5],ymm11[6,7]
6816 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm0[6,7]
6817 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
6818 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm17[0,1,1,3,4,5,5,7]
6819 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
6820 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm24[0,1,1,3,4,5,5,7]
6821 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
6822 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5,6],ymm11[7]
6823 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[3,1,2,3,7,5,6,7]
6824 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm13[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
6825 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm26[3,1,2,3,7,5,6,7]
6826 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm15[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
6827 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4],ymm14[5],ymm12[6,7]
6828 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4,5],ymm11[6,7]
6829 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm18[2,2,2,2]
6830 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm12 = xmm9[0,1,2],xmm12[3]
6831 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm5[2],xmm8[2],xmm5[3],xmm8[3]
6832 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3]
6833 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
6834 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1
6835 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm12 = [3,7,0,0]
6836 ; AVX512DQ-NEXT: vpermt2d %xmm16, %xmm12, %xmm4
6837 ; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm11
6838 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3]
6839 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm4, %zmm0, %zmm4
6840 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6841 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6842 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
6843 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6844 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6845 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
6846 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6847 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm4 {%k1}
6848 ; AVX512DQ-NEXT: vpermt2d %xmm8, %xmm12, %xmm5
6849 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm18[2],xmm9[3],xmm18[3]
6850 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
6851 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
6852 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
6853 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7]
6854 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
6855 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
6856 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4],ymm3[5],ymm5[6,7]
6857 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6858 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6859 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm4, %zmm0
6860 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6861 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi)
6862 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6863 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx)
6864 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6865 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx)
6866 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6867 ; AVX512DQ-NEXT: vmovaps %zmm2, (%r8)
6868 ; AVX512DQ-NEXT: vmovdqa64 %zmm27, (%r9)
6869 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6870 ; AVX512DQ-NEXT: vmovdqa64 %zmm20, (%rax)
6871 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6872 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
6873 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
6874 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rax)
6875 ; AVX512DQ-NEXT: addq $616, %rsp # imm = 0x268
6876 ; AVX512DQ-NEXT: vzeroupper
6877 ; AVX512DQ-NEXT: retq
6879 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf32:
6880 ; AVX512DQ-FCP: # %bb.0:
6881 ; AVX512DQ-FCP-NEXT: subq $552, %rsp # imm = 0x228
6882 ; AVX512DQ-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
6883 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6884 ; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
6885 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6886 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6887 ; AVX512DQ-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
6888 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6889 ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
6890 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6891 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6892 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,0,0,4]
6893 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, %xmm0
6894 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm16
6895 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm1, %xmm0
6896 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
6897 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm2
6898 ; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
6899 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6900 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm3
6901 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6902 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6903 ; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
6904 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
6905 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm3
6906 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6907 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
6908 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm4[0],xmm9[1],xmm4[1]
6909 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
6910 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
6911 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm11
6912 ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm0
6913 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6914 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm21 = ymm0[0,1,0,2]
6915 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm21[0,1,2,0,4,5,6,4]
6916 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm12[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6917 ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm1
6918 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6919 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm27 = ymm1[0,1,0,2]
6920 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm27[0,1,2,0,4,5,6,4]
6921 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6922 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
6923 ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm1
6924 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6925 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm28 = ymm1[0,1,0,2]
6926 ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm1
6927 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6928 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
6929 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm28[0,2,2,3,4,6,6,7]
6930 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6931 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,2,2,3,4,6,6,7]
6932 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6933 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
6934 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6935 ; AVX512DQ-FCP-NEXT: movb $-64, %al
6936 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
6937 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm11 {%k1}
6938 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
6939 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6940 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
6941 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6942 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6943 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm0
6944 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6945 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm1
6946 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6947 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
6948 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, %xmm0
6949 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm2, %xmm0
6950 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm25
6951 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
6952 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6953 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
6954 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6955 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
6956 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6957 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
6958 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
6959 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
6960 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
6961 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
6962 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
6963 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm1[0,1],xmm0[2,3]
6964 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
6965 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6966 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm30 = ymm0[0,1,0,2]
6967 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm23
6968 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm31 = ymm23[0,1,0,2]
6969 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm30[0,1,2,0,4,5,6,4]
6970 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
6971 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm31[0,1,2,0,4,5,6,4]
6972 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
6973 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,6],ymm0[7]
6974 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm0
6975 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6976 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm19 = ymm0[0,1,0,2]
6977 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
6978 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
6979 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm18 = ymm0[0,1,0,2]
6980 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,2,2,3,4,6,6,7]
6981 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm14[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
6982 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[0,2,2,3,4,6,6,7]
6983 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
6984 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
6985 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6986 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7]
6987 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0
6988 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6989 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm11 = [1,5,0,0]
6990 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0
6991 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm11, %xmm0
6992 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm16[0],xmm22[0],xmm16[1],xmm22[1]
6993 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
6994 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
6995 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
6996 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5,6],ymm1[7]
6997 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
6998 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
6999 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4],ymm5[5],ymm7[6,7]
7000 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
7001 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3,4,5],ymm1[6,7]
7002 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
7003 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm1
7004 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm11, %xmm1
7005 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm13[0],xmm25[0],xmm13[1],xmm25[1]
7006 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3]
7007 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
7008 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
7009 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
7010 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm14[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7011 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
7012 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
7013 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
7014 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
7015 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
7016 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7017 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
7018 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm16[2],xmm22[2],xmm16[3],xmm22[3]
7019 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
7020 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm22, %xmm0, %xmm2
7021 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm9[2],xmm20[2],xmm9[3],xmm20[3]
7022 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
7023 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm3
7024 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,1,1,3,4,5,5,7]
7025 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
7026 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm27[0,1,1,3,4,5,5,7]
7027 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm4[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
7028 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm5[7]
7029 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm28[3,1,2,3,7,5,6,7]
7030 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
7031 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm29[3,1,2,3,7,5,6,7]
7032 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm8[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
7033 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm12[0,1,2,3,4],ymm10[5],ymm12[6,7]
7034 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
7035 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm3 {%k1}
7036 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm13, %xmm7
7037 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm13[2],xmm25[2],xmm13[3],xmm25[3]
7038 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm25, %xmm0, %xmm7
7039 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm12 = xmm6[2],xmm17[2],xmm6[3],xmm17[3]
7040 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm7 = xmm12[0,1],xmm7[2,3]
7041 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[0,1,1,3,4,5,5,7]
7042 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
7043 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm31[0,1,1,3,4,5,5,7]
7044 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
7045 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7]
7046 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[3,1,2,3,7,5,6,7]
7047 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
7048 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm18[3,1,2,3,7,5,6,7]
7049 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
7050 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
7051 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
7052 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7]
7053 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7054 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
7055 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm27 = [3,7,0,0]
7056 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm27, %xmm9
7057 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm0
7058 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3]
7059 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
7060 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
7061 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
7062 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
7063 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
7064 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
7065 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
7066 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
7067 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 {%k1}
7068 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm27, %xmm6
7069 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm6[0,1],xmm10[2,3]
7070 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
7071 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
7072 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
7073 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
7074 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
7075 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
7076 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
7077 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
7078 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm28
7079 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7080 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
7081 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7082 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7083 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
7084 ; AVX512DQ-FCP-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7085 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7086 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
7087 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7088 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
7089 ; AVX512DQ-FCP-NEXT: vpunpckhwd (%rsp), %xmm0, %xmm5 # 16-byte Folded Reload
7090 ; AVX512DQ-FCP-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
7091 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm1
7092 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,0,4]
7093 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm0, %xmm1
7094 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
7095 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
7096 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm18
7097 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
7098 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm31
7099 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
7100 ; AVX512DQ-FCP-NEXT: # ymm29 = mem[0,1,1,3]
7101 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm29[0,1,2,0,4,5,6,4]
7102 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm26 # 32-byte Folded Reload
7103 ; AVX512DQ-FCP-NEXT: # ymm26 = mem[0,1,1,3]
7104 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
7105 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,2,0,4,5,6,4]
7106 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
7107 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
7108 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
7109 ; AVX512DQ-FCP-NEXT: # ymm30 = mem[0,1,1,3]
7110 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload
7111 ; AVX512DQ-FCP-NEXT: # ymm25 = mem[0,1,1,3]
7112 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[0,2,2,3,4,6,6,7]
7113 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
7114 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm25[0,2,2,3,4,6,6,7]
7115 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm13[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
7116 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4],ymm4[5],ymm7[6,7]
7117 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
7118 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm31 {%k1}
7119 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7120 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
7121 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
7122 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7123 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
7124 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
7125 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7126 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
7127 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
7128 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm0
7129 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
7130 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
7131 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
7132 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm7 # 16-byte Folded Reload
7133 ; AVX512DQ-FCP-NEXT: # xmm7 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
7134 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm7[0],xmm4[0],xmm7[1],xmm4[1]
7135 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm16
7136 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm11 = xmm10[0,1],xmm0[2,3]
7137 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm20 # 32-byte Folded Reload
7138 ; AVX512DQ-FCP-NEXT: # ymm20 = mem[0,1,1,3]
7139 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm20[0,1,2,0,4,5,6,4]
7140 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm23 = ymm23[0,1,1,3]
7141 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
7142 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,1,2,0,4,5,6,4]
7143 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm15 = ymm14[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
7144 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4,5,6],ymm0[7]
7145 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm19 # 32-byte Folded Reload
7146 ; AVX512DQ-FCP-NEXT: # ymm19 = mem[0,1,1,3]
7147 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm21 # 32-byte Folded Reload
7148 ; AVX512DQ-FCP-NEXT: # ymm21 = mem[0,1,1,3]
7149 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm19[0,2,2,3,4,6,6,7]
7150 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
7151 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm21[0,2,2,3,4,6,6,7]
7152 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
7153 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
7154 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
7155 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7]
7156 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31
7157 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm0
7158 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm4 = [1,5,0,0]
7159 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm4, %xmm0
7160 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1]
7161 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
7162 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
7163 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
7164 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
7165 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7166 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm13[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
7167 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
7168 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
7169 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
7170 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
7171 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, %xmm3
7172 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm16, %xmm7, %xmm3
7173 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm22[0],xmm17[1],xmm22[1]
7174 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm3[0,1],xmm1[2,3]
7175 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
7176 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm14[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
7177 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
7178 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
7179 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
7180 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
7181 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5],ymm1[6,7]
7182 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm12, %xmm8
7183 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm2
7184 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm12[2],xmm24[2],xmm12[3],xmm24[3]
7185 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm9[4,5,6,7]
7186 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm24
7187 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm0 = [0,0,2,6]
7188 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm0, %xmm8
7189 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm5[2],xmm18[2],xmm5[3],xmm18[3]
7190 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm8[2,3]
7191 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
7192 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,1,3,4,5,5,7]
7193 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
7194 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm26[0,1,1,3,4,5,5,7]
7195 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
7196 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5,6],ymm3[7]
7197 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm30[3,1,2,3,7,5,6,7]
7198 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
7199 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm25[3,1,2,3,7,5,6,7]
7200 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
7201 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4],ymm9[5],ymm12[6,7]
7202 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm3[6,7]
7203 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
7204 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm22, %xmm17, %xmm0
7205 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm7[2],xmm16[2],xmm7[3],xmm16[3]
7206 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm0[2,3]
7207 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm20[0,1,1,3,4,5,5,7]
7208 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm12 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
7209 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm13 = ymm23[0,1,1,3,4,5,5,7]
7210 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm14 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
7211 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm14[0,1,2,3,4,5,6],ymm12[7]
7212 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[3,1,2,3,7,5,6,7]
7213 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm14[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
7214 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm21[3,1,2,3,7,5,6,7]
7215 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm0[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
7216 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm15[5],ymm4[6,7]
7217 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm12[6,7]
7218 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
7219 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
7220 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm27, %xmm5
7221 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm10[2,3]
7222 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
7223 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
7224 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
7225 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7]
7226 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
7227 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
7228 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
7229 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
7230 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm3 {%k1}
7231 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm16, %xmm27, %xmm7
7232 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm22[2],xmm17[3],xmm22[3]
7233 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm7[0,1],xmm2[2,3]
7234 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
7235 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
7236 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
7237 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm14[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
7238 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
7239 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm5[5],ymm0[6,7]
7240 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm4[6,7]
7241 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
7242 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7243 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7244 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi)
7245 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7246 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx)
7247 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
7248 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rcx)
7249 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, (%r8)
7250 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, (%r9)
7251 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7252 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, (%rax)
7253 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7254 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, (%rax)
7255 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7256 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
7257 ; AVX512DQ-FCP-NEXT: addq $552, %rsp # imm = 0x228
7258 ; AVX512DQ-FCP-NEXT: vzeroupper
7259 ; AVX512DQ-FCP-NEXT: retq
7261 ; AVX512BW-LABEL: load_i16_stride8_vf32:
7262 ; AVX512BW: # %bb.0:
7263 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
7264 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
7265 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
7266 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
7267 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
7268 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
7269 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm4
7270 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm5
7271 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3
7272 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7
7273 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6
7274 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
7275 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7276 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9
7277 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm9
7278 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
7279 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm10
7280 ; AVX512BW-NEXT: movb $-64, %dil
7281 ; AVX512BW-NEXT: kmovd %edi, %k1
7282 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
7283 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9
7284 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm9
7285 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm8
7286 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
7287 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7288 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
7289 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7290 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10
7291 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm10
7292 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11
7293 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11
7294 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
7295 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
7296 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm9, %zmm10
7297 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm9
7298 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
7299 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
7300 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
7301 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7302 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm11
7303 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11
7304 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm12
7305 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm12
7306 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
7307 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11
7308 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm11
7309 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm10
7310 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
7311 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
7312 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
7313 ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7314 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm12
7315 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm11, %zmm12
7316 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13
7317 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm13
7318 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
7319 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12
7320 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm12
7321 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm11
7322 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
7323 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
7324 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
7325 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7326 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13
7327 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13
7328 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14
7329 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14
7330 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
7331 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13
7332 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm13
7333 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm12
7334 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
7335 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
7336 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
7337 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7338 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm14
7339 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm14
7340 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15
7341 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15
7342 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
7343 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14
7344 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm13, %zmm14
7345 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm13
7346 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
7347 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
7348 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
7349 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7350 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm15
7351 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15
7352 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm16
7353 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm16
7354 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
7355 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm15
7356 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm15
7357 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm14
7358 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
7359 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
7360 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
7361 ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7362 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6
7363 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm3
7364 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
7365 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm15, %zmm1
7366 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm0
7367 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7368 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7369 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi)
7370 ; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rdx)
7371 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rcx)
7372 ; AVX512BW-NEXT: vmovdqa64 %zmm11, (%r8)
7373 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%r9)
7374 ; AVX512BW-NEXT: vmovdqa64 %zmm13, (%r11)
7375 ; AVX512BW-NEXT: vmovdqa64 %zmm14, (%r10)
7376 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
7377 ; AVX512BW-NEXT: vzeroupper
7378 ; AVX512BW-NEXT: retq
7380 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf32:
7381 ; AVX512BW-FCP: # %bb.0:
7382 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7383 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7384 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
7385 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
7386 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
7387 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
7388 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
7389 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5
7390 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
7391 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
7392 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
7393 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
7394 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7395 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
7396 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm9
7397 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
7398 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm8, %zmm10
7399 ; AVX512BW-FCP-NEXT: movb $-64, %dil
7400 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
7401 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
7402 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
7403 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm9
7404 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm8
7405 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
7406 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7407 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
7408 ; AVX512BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7409 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
7410 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm10
7411 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11
7412 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm9, %zmm11
7413 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
7414 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
7415 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm10
7416 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm9
7417 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
7418 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
7419 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
7420 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7421 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
7422 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm11
7423 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
7424 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm12
7425 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
7426 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
7427 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm11
7428 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm10
7429 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
7430 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
7431 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
7432 ; AVX512BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7433 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
7434 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm11, %zmm12
7435 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
7436 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm13
7437 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
7438 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
7439 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm11, %zmm12
7440 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm11
7441 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
7442 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
7443 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
7444 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7445 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
7446 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm12, %zmm13
7447 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14
7448 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm12, %zmm14
7449 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
7450 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
7451 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm13
7452 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm12
7453 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
7454 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
7455 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
7456 ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7457 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
7458 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm14
7459 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
7460 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm15
7461 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
7462 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
7463 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm14
7464 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm13
7465 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
7466 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
7467 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
7468 ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7469 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
7470 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm14, %zmm15
7471 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16
7472 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm16
7473 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
7474 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
7475 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm15
7476 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm14
7477 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
7478 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
7479 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
7480 ; AVX512BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7481 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm6
7482 ; AVX512BW-FCP-NEXT: vpermt2w %zmm5, %zmm15, %zmm3
7483 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
7484 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm15, %zmm1
7485 ; AVX512BW-FCP-NEXT: vpermt2w %zmm2, %zmm15, %zmm0
7486 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7487 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7488 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
7489 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
7490 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
7491 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8)
7492 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9)
7493 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11)
7494 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10)
7495 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
7496 ; AVX512BW-FCP-NEXT: vzeroupper
7497 ; AVX512BW-FCP-NEXT: retq
7499 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf32:
7500 ; AVX512DQ-BW: # %bb.0:
7501 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
7502 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
7503 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
7504 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
7505 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
7506 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
7507 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm4
7508 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm5
7509 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3
7510 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7
7511 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6
7512 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
7513 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7514 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm9
7515 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm9
7516 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
7517 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm8, %zmm10
7518 ; AVX512DQ-BW-NEXT: movb $-64, %dil
7519 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
7520 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
7521 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm9
7522 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm9
7523 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm8
7524 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
7525 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7526 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
7527 ; AVX512DQ-BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7528 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10
7529 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm9, %zmm10
7530 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm11
7531 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm9, %zmm11
7532 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
7533 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10
7534 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm9, %zmm10
7535 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm9
7536 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
7537 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
7538 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
7539 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7540 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm11
7541 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm11
7542 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm12
7543 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm12
7544 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
7545 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm11
7546 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm11
7547 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm10
7548 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
7549 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
7550 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
7551 ; AVX512DQ-BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7552 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm12
7553 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm11, %zmm12
7554 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13
7555 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm13
7556 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
7557 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm12
7558 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm11, %zmm12
7559 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm11
7560 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
7561 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
7562 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
7563 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7564 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm13
7565 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm12, %zmm13
7566 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm14
7567 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm12, %zmm14
7568 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
7569 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm13
7570 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm13
7571 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm12
7572 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
7573 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
7574 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
7575 ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7576 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm14
7577 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm14
7578 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm15
7579 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm13, %zmm15
7580 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
7581 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm14
7582 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm13, %zmm14
7583 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm13
7584 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
7585 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
7586 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
7587 ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7588 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm15
7589 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm14, %zmm15
7590 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm16
7591 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm14, %zmm16
7592 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
7593 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm15
7594 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm15
7595 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm0, %zmm14
7596 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
7597 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
7598 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
7599 ; AVX512DQ-BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7600 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm15, %zmm6
7601 ; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm15, %zmm3
7602 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
7603 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm15, %zmm1
7604 ; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm15, %zmm0
7605 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7606 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7607 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi)
7608 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rdx)
7609 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rcx)
7610 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, (%r8)
7611 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%r9)
7612 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, (%r11)
7613 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%r10)
7614 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
7615 ; AVX512DQ-BW-NEXT: vzeroupper
7616 ; AVX512DQ-BW-NEXT: retq
7618 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf32:
7619 ; AVX512DQ-BW-FCP: # %bb.0:
7620 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
7621 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
7622 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
7623 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
7624 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
7625 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
7626 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm4
7627 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm5
7628 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
7629 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
7630 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
7631 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
7632 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7633 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm9
7634 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm9
7635 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
7636 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm8, %zmm10
7637 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil
7638 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
7639 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1}
7640 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm9
7641 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm9
7642 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm8
7643 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm9[4,5,6,7]
7644 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
7645 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
7646 ; AVX512DQ-BW-FCP-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7647 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
7648 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm9, %zmm10
7649 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm11
7650 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm9, %zmm11
7651 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1}
7652 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
7653 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm10
7654 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm9
7655 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
7656 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm9, %zmm11, %zmm9
7657 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
7658 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7659 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm11
7660 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm11
7661 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm12
7662 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm10, %zmm12
7663 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1}
7664 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm11
7665 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm11
7666 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm10
7667 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3],ymm11[4,5,6,7]
7668 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm10, %zmm12, %zmm10
7669 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm11 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
7670 ; AVX512DQ-BW-FCP-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7671 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm12
7672 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm11, %zmm12
7673 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
7674 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm11, %zmm13
7675 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1}
7676 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm12
7677 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm11, %zmm12
7678 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm11
7679 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
7680 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm11, %zmm13, %zmm11
7681 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm12 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
7682 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7683 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm13
7684 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm12, %zmm13
7685 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm14
7686 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm12, %zmm14
7687 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, %zmm14 {%k1}
7688 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm13
7689 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm13
7690 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm12
7691 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
7692 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm12, %zmm14, %zmm12
7693 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
7694 ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7695 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm14
7696 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm14
7697 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm15
7698 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm13, %zmm15
7699 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, %zmm15 {%k1}
7700 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm14
7701 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm14
7702 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm13
7703 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3],ymm14[4,5,6,7]
7704 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm13, %zmm15, %zmm13
7705 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm14 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
7706 ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7707 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm15
7708 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm14, %zmm15
7709 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm16
7710 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm14, %zmm16
7711 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm16 {%k1}
7712 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm15
7713 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm15
7714 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm0, %zmm14
7715 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3],ymm15[4,5,6,7]
7716 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm14, %zmm16, %zmm14
7717 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
7718 ; AVX512DQ-BW-FCP-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
7719 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm15, %zmm6
7720 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm5, %zmm15, %zmm3
7721 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
7722 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm15, %zmm1
7723 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm2, %zmm15, %zmm0
7724 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
7725 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
7726 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
7727 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdx)
7728 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rcx)
7729 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, (%r8)
7730 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%r9)
7731 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r11)
7732 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r10)
7733 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
7734 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
7735 ; AVX512DQ-BW-FCP-NEXT: retq
7736 %wide.vec = load <256 x i16>, ptr %in.vec, align 64
7737 %strided.vec0 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248>
7738 %strided.vec1 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249>
7739 %strided.vec2 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250>
7740 %strided.vec3 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251>
7741 %strided.vec4 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252>
7742 %strided.vec5 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253>
7743 %strided.vec6 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254>
7744 %strided.vec7 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <32 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255>
7745 store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
7746 store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
7747 store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
7748 store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
7749 store <32 x i16> %strided.vec4, ptr %out.vec4, align 64
7750 store <32 x i16> %strided.vec5, ptr %out.vec5, align 64
7751 store <32 x i16> %strided.vec6, ptr %out.vec6, align 64
7752 store <32 x i16> %strided.vec7, ptr %out.vec7, align 64
7753 ret void
7754 }
7756 define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5, ptr %out.vec6, ptr %out.vec7) nounwind {
7757 ; SSE-LABEL: load_i16_stride8_vf64:
7758 ; SSE: # %bb.0:
7759 ; SSE-NEXT: subq $1800, %rsp # imm = 0x708
7760 ; SSE-NEXT: movdqa 752(%rdi), %xmm2
7761 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7762 ; SSE-NEXT: movdqa 736(%rdi), %xmm3
7763 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7764 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
7765 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7766 ; SSE-NEXT: movdqa 192(%rdi), %xmm4
7767 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7768 ; SSE-NEXT: movdqa 240(%rdi), %xmm5
7769 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7770 ; SSE-NEXT: movdqa 224(%rdi), %xmm6
7771 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7772 ; SSE-NEXT: movdqa 144(%rdi), %xmm7
7773 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7774 ; SSE-NEXT: movdqa 128(%rdi), %xmm8
7775 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7776 ; SSE-NEXT: movdqa 176(%rdi), %xmm9
7777 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7778 ; SSE-NEXT: movdqa 160(%rdi), %xmm0
7779 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7780 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3]
7781 ; SSE-NEXT: movdqa %xmm0, %xmm9
7782 ; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3]
7783 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
7784 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,0,0]
7785 ; SSE-NEXT: movdqa %xmm6, %xmm12
7786 ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
7787 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7788 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,0,0,0]
7789 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7790 ; SSE-NEXT: movdqa %xmm8, %xmm0
7791 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7792 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
7793 ; SSE-NEXT: movdqa %xmm9, %xmm7
7794 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7795 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7796 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7797 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7798 ; SSE-NEXT: movdqa 720(%rdi), %xmm0
7799 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7800 ; SSE-NEXT: movdqa 704(%rdi), %xmm1
7801 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7802 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7803 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7804 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0]
7805 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
7806 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7807 ; SSE-NEXT: movdqa 688(%rdi), %xmm2
7808 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7809 ; SSE-NEXT: movdqa 672(%rdi), %xmm3
7810 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7811 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7812 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7813 ; SSE-NEXT: movdqa 656(%rdi), %xmm2
7814 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7815 ; SSE-NEXT: movdqa 640(%rdi), %xmm0
7816 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7817 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7818 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7819 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7820 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7821 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7822 ; SSE-NEXT: movdqa 624(%rdi), %xmm0
7823 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7824 ; SSE-NEXT: movdqa 608(%rdi), %xmm2
7825 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7826 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
7827 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7828 ; SSE-NEXT: movdqa 592(%rdi), %xmm0
7829 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7830 ; SSE-NEXT: movdqa 576(%rdi), %xmm1
7831 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7832 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7833 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7834 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,0,0,0]
7835 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
7836 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7837 ; SSE-NEXT: movdqa 560(%rdi), %xmm2
7838 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7839 ; SSE-NEXT: movdqa 544(%rdi), %xmm3
7840 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7841 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7842 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7843 ; SSE-NEXT: movdqa 528(%rdi), %xmm2
7844 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7845 ; SSE-NEXT: movdqa 512(%rdi), %xmm0
7846 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7847 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7848 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7849 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7850 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7851 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7852 ; SSE-NEXT: movdqa 496(%rdi), %xmm0
7853 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7854 ; SSE-NEXT: movdqa 480(%rdi), %xmm1
7855 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7856 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7857 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7858 ; SSE-NEXT: movdqa 464(%rdi), %xmm0
7859 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7860 ; SSE-NEXT: movdqa 448(%rdi), %xmm10
7861 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7862 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
7863 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7864 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
7865 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
7866 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7867 ; SSE-NEXT: movdqa 432(%rdi), %xmm2
7868 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7869 ; SSE-NEXT: movdqa 416(%rdi), %xmm3
7870 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7871 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7872 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7873 ; SSE-NEXT: movdqa 400(%rdi), %xmm2
7874 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7875 ; SSE-NEXT: movdqa 384(%rdi), %xmm0
7876 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7877 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7878 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7879 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7880 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7881 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7882 ; SSE-NEXT: movdqa 1008(%rdi), %xmm0
7883 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7884 ; SSE-NEXT: movdqa 992(%rdi), %xmm1
7885 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7886 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7887 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7888 ; SSE-NEXT: movdqa 976(%rdi), %xmm0
7889 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7890 ; SSE-NEXT: movdqa 960(%rdi), %xmm13
7891 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7892 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
7893 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7894 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
7895 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,0,0,0]
7896 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7897 ; SSE-NEXT: movdqa 944(%rdi), %xmm2
7898 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7899 ; SSE-NEXT: movdqa 928(%rdi), %xmm3
7900 ; SSE-NEXT: movdqa %xmm3, (%rsp) # 16-byte Spill
7901 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7902 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7903 ; SSE-NEXT: movdqa 912(%rdi), %xmm2
7904 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7905 ; SSE-NEXT: movdqa 896(%rdi), %xmm0
7906 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7907 ; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
7908 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7909 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7910 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7911 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7912 ; SSE-NEXT: movdqa 368(%rdi), %xmm0
7913 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7914 ; SSE-NEXT: movdqa 352(%rdi), %xmm1
7915 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7916 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7917 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7918 ; SSE-NEXT: movdqa 336(%rdi), %xmm0
7919 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7920 ; SSE-NEXT: movdqa 320(%rdi), %xmm10
7921 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7922 ; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
7923 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7924 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
7925 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
7926 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7927 ; SSE-NEXT: movdqa 304(%rdi), %xmm2
7928 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7929 ; SSE-NEXT: movdqa 288(%rdi), %xmm3
7930 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7931 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7932 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7933 ; SSE-NEXT: movdqa 272(%rdi), %xmm0
7934 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7935 ; SSE-NEXT: movdqa 256(%rdi), %xmm14
7936 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7937 ; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
7938 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7939 ; SSE-NEXT: movdqa %xmm14, %xmm0
7940 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7941 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7942 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7943 ; SSE-NEXT: movdqa 880(%rdi), %xmm0
7944 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7945 ; SSE-NEXT: movdqa 864(%rdi), %xmm1
7946 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7947 ; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7948 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7949 ; SSE-NEXT: movdqa 848(%rdi), %xmm0
7950 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7951 ; SSE-NEXT: movdqa 832(%rdi), %xmm9
7952 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7953 ; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
7954 ; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7955 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
7956 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[0,0,0,0]
7957 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
7958 ; SSE-NEXT: movdqa 816(%rdi), %xmm2
7959 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7960 ; SSE-NEXT: movdqa 800(%rdi), %xmm3
7961 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7962 ; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
7963 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7964 ; SSE-NEXT: movdqa 784(%rdi), %xmm0
7965 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7966 ; SSE-NEXT: movdqa 768(%rdi), %xmm15
7967 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7968 ; SSE-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
7969 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7970 ; SSE-NEXT: movdqa %xmm15, %xmm0
7971 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
7972 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
7973 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7974 ; SSE-NEXT: movdqa 112(%rdi), %xmm0
7975 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7976 ; SSE-NEXT: movdqa 96(%rdi), %xmm13
7977 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7978 ; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
7979 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7980 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
7981 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7982 ; SSE-NEXT: movdqa 64(%rdi), %xmm5
7983 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7984 ; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
7985 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7986 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,0,0,0]
7987 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,0,0,0]
7988 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
7989 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
7990 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7991 ; SSE-NEXT: movdqa 48(%rdi), %xmm1
7992 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7993 ; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
7994 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7995 ; SSE-NEXT: movdqa (%rdi), %xmm6
7996 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7997 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
7998 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
7999 ; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3]
8000 ; SSE-NEXT: movdqa %xmm6, %xmm1
8001 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8002 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
8003 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8004 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8005 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[1,1,1,1]
8006 ; SSE-NEXT: movdqa %xmm7, %xmm14
8007 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[1,1,1,1]
8008 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8009 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8010 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8011 ; SSE-NEXT: movdqa %xmm7, %xmm0
8012 ; SSE-NEXT: movdqa %xmm12, %xmm4
8013 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8014 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
8015 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8016 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8017 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8018 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
8019 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8020 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
8021 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8022 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8023 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8024 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
8025 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8026 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8027 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8028 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[1,1,1,1]
8029 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8030 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
8031 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8032 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8033 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8034 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
8035 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8036 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8037 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8038 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8039 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8040 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8041 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8042 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8043 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8044 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8045 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8046 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8047 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8048 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[1,1,1,1]
8049 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8050 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
8051 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8052 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8053 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8054 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8055 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8056 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8057 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8058 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8059 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8060 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8061 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8062 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8063 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8064 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8065 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8066 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8067 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8068 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8069 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8070 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8071 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8072 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8073 ; SSE-NEXT: movaps %xmm5, %xmm0
8074 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8075 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8076 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8077 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8078 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
8079 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8080 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
8081 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8082 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8083 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8084 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8085 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8086 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8087 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
8088 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2]
8089 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8090 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8091 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
8092 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8093 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8094 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,2,2,2]
8095 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8096 ; SSE-NEXT: # xmm1 = mem[2,2,2,2]
8097 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8098 ; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm13[2],xmm12[3],xmm13[3]
8099 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm12[0],xmm1[1]
8100 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8101 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
8102 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8103 ; SSE-NEXT: # xmm1 = mem[2,2,2,2]
8104 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8105 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm11[2],xmm10[3],xmm11[3]
8106 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm10[0],xmm1[1]
8107 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8108 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8109 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
8110 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8111 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm14[2,2,2,2]
8112 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8113 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8114 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8115 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
8116 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8117 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8118 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
8119 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8120 ; SSE-NEXT: # xmm1 = mem[2,2,2,2]
8121 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8122 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
8123 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm9[0],xmm1[1]
8124 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8125 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8126 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[2,2,2,2]
8127 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8128 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm15[2,2,2,2]
8129 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8130 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8131 ; SSE-NEXT: movapd %xmm4, %xmm0
8132 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8133 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
8134 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8135 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8136 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8137 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2]
8138 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,2,2,2]
8139 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8140 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8141 ; SSE-NEXT: movapd %xmm7, %xmm0
8142 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8143 ; SSE-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm8[2],xmm0[3],xmm8[3]
8144 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8145 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8146 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8147 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm9[2,2,2,2]
8148 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8149 ; SSE-NEXT: # xmm0 = mem[2,2,2,2]
8150 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
8151 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8152 ; SSE-NEXT: movdqa %xmm5, %xmm1
8153 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
8154 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8155 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8156 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8157 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8158 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8159 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8160 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8161 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8162 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8163 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8164 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8165 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8166 ; SSE-NEXT: movdqa %xmm15, %xmm2
8167 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm13[2],xmm2[3],xmm13[3]
8168 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[3,3,3,3]
8169 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,3,3,3]
8170 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8171 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8172 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8173 ; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm10[2],xmm14[3],xmm10[3]
8174 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8175 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8176 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[3,3,3,3]
8177 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8178 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
8179 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8180 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8181 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8182 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8183 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8184 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8185 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8186 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8187 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8188 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8189 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8190 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8191 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8192 ; SSE-NEXT: # xmm4 = xmm4[2],mem[2],xmm4[3],mem[3]
8193 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8194 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8195 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8196 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8197 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8198 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
8199 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8200 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8201 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm11[2],xmm2[3],xmm11[3]
8202 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[3,3,3,3]
8203 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[3,3,3,3]
8204 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8205 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8206 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8207 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8208 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8209 ; SSE-NEXT: # xmm2 = xmm2[2],mem[2],xmm2[3],mem[3]
8210 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8211 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8212 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8213 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8214 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8215 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8216 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8217 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8218 ; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm9[2],xmm2[3],xmm9[3]
8219 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[3,3,3,3]
8220 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[3,3,3,3]
8221 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8222 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
8223 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8224 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8225 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8226 ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
8227 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8228 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
8229 ; SSE-NEXT: # xmm11 = xmm11[4],mem[4],xmm11[5],mem[5],xmm11[6],mem[6],xmm11[7],mem[7]
8230 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8231 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8232 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8233 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8234 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8235 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8236 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8237 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8238 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8239 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8240 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8241 ; SSE-NEXT: movdqa %xmm11, %xmm0
8242 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1]
8243 ; SSE-NEXT: movdqa %xmm6, %xmm13
8244 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8245 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8246 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8247 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
8248 ; SSE-NEXT: # xmm14 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
8249 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8250 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8251 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8252 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8253 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8254 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8255 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8256 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8257 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8258 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8259 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
8260 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8261 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8262 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0]
8263 ; SSE-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
8264 ; SSE-NEXT: movdqa %xmm2, %xmm0
8265 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
8266 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm0[0],xmm3[1]
8267 ; SSE-NEXT: movapd %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8268 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8269 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
8270 ; SSE-NEXT: # xmm10 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7]
8271 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8272 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8273 ; SSE-NEXT: # xmm6 = xmm6[4],mem[4],xmm6[5],mem[5],xmm6[6],mem[6],xmm6[7],mem[7]
8274 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8275 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8276 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8277 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8278 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8279 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8280 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8281 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8282 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8283 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8284 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8285 ; SSE-NEXT: movdqa %xmm6, %xmm0
8286 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1]
8287 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8288 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8289 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8290 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8291 ; SSE-NEXT: # xmm8 = xmm8[4],mem[4],xmm8[5],mem[5],xmm8[6],mem[6],xmm8[7],mem[7]
8292 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8293 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8294 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8295 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8296 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8297 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8298 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8299 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8300 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8301 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8302 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
8303 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8304 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8305 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm3[0,0,0,0]
8306 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm0[2],xmm7[3],xmm0[3]
8307 ; SSE-NEXT: movdqa %xmm2, %xmm0
8308 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1]
8309 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm0[0],xmm7[1]
8310 ; SSE-NEXT: movapd %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8311 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8312 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8313 ; SSE-NEXT: # xmm4 = xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
8314 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8315 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8316 ; SSE-NEXT: # xmm5 = xmm5[4],mem[4],xmm5[5],mem[5],xmm5[6],mem[6],xmm5[7],mem[7]
8317 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8318 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8319 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8320 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8321 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8322 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8323 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8324 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8325 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8326 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm2[0,0,0,0]
8327 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
8328 ; SSE-NEXT: movdqa %xmm5, %xmm0
8329 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8330 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
8331 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8332 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
8333 ; SSE-NEXT: movapd %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8334 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8335 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8336 ; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
8337 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8338 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8339 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8340 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8341 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8342 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8343 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
8344 ; SSE-NEXT: # xmm12 = xmm12[4],mem[4],xmm12[5],mem[5],xmm12[6],mem[6],xmm12[7],mem[7]
8345 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8346 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8347 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
8348 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8349 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[0,0,0,0]
8350 ; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8351 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,0,0,0]
8352 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8353 ; SSE-NEXT: movdqa %xmm2, %xmm0
8354 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
8355 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8356 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8357 ; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload
8358 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8359 ; SSE-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
8360 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill
8361 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8362 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
8363 ; SSE-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
8364 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8365 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8366 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8367 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8368 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8369 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8370 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8371 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8372 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8373 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8374 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8375 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8376 ; SSE-NEXT: movdqa %xmm3, %xmm0
8377 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8378 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8379 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8380 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8381 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8382 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8383 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8384 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8385 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8386 ; SSE-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
8387 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8388 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8389 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8390 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8391 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8392 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8393 ; SSE-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
8394 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8395 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8396 ; SSE-NEXT: punpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
8397 ; SSE-NEXT: # xmm7 = xmm7[4],mem[4],xmm7[5],mem[5],xmm7[6],mem[6],xmm7[7],mem[7]
8398 ; SSE-NEXT: movdqa %xmm7, %xmm2
8399 ; SSE-NEXT: movdqa %xmm7, %xmm3
8400 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8401 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
8402 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
8403 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8404 ; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8405 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[1,1,1,1]
8406 ; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8407 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[1,1,1,1]
8408 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8409 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8410 ; SSE-NEXT: movaps %xmm13, %xmm0
8411 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
8412 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
8413 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8414 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8415 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8416 ; SSE-NEXT: # xmm2 = mem[1,1,1,1]
8417 ; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8418 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[1,1,1,1]
8419 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
8420 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
8421 ; SSE-NEXT: movaps %xmm7, %xmm0
8422 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8423 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1]
8424 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm0[2,3]
8425 ; SSE-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8426 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8427 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[1,1,1,1]
8428 ; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8429 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
8430 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8431 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
8432 ; SSE-NEXT: movaps %xmm10, %xmm0
8433 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8434 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
8435 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8436 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8437 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8438 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8439 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8440 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
8441 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8442 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
8443 ; SSE-NEXT: movaps %xmm8, %xmm0
8444 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8445 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8446 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8447 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8448 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm5[1,1,1,1]
8449 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[1,1,1,1]
8450 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8451 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8452 ; SSE-NEXT: unpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8453 ; SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
8454 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8455 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8456 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8457 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8458 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8459 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm15[1,1,1,1]
8460 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8461 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
8462 ; SSE-NEXT: movdqa %xmm6, %xmm0
8463 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1]
8464 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8465 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8466 ; SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8467 ; SSE-NEXT: # xmm1 = mem[1,1,1,1]
8468 ; SSE-NEXT: pshufd $85, (%rsp), %xmm0 # 16-byte Folded Reload
8469 ; SSE-NEXT: # xmm0 = mem[1,1,1,1]
8470 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8471 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8472 ; SSE-NEXT: movaps %xmm5, %xmm0
8473 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
8474 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
8475 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8476 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8477 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
8478 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8479 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm12[1,1,1,1]
8480 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8481 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
8482 ; SSE-NEXT: movaps %xmm3, %xmm0
8483 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
8484 ; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
8485 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
8486 ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8487 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[2,2,2,2]
8488 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[2,2,2,2]
8489 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3]
8490 ; SSE-NEXT: movdqa %xmm11, %xmm1
8491 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8492 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8493 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8494 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8495 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8496 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
8497 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm7[2,2,2,2]
8498 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8499 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8500 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8501 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8502 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8503 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8504 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[2,2,2,2]
8505 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm10[2,2,2,2]
8506 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8507 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8508 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8509 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8510 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8511 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8512 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8513 ; SSE-NEXT: # xmm0 = mem[2,2,2,2]
8514 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[2,2,2,2]
8515 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8516 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8517 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8518 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8519 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
8520 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8521 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8522 ; SSE-NEXT: # xmm0 = mem[2,2,2,2]
8523 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
8524 ; SSE-NEXT: # xmm11 = mem[2,2,2,2]
8525 ; SSE-NEXT: punpckhdq {{.*#+}} xmm11 = xmm11[2],xmm0[2],xmm11[3],xmm0[3]
8526 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
8527 ; SSE-NEXT: movapd %xmm14, %xmm0
8528 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8529 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8530 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm0[0],xmm11[1]
8531 ; SSE-NEXT: pshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8532 ; SSE-NEXT: # xmm0 = mem[2,2,2,2]
8533 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm6[2,2,2,2]
8534 ; SSE-NEXT: punpckhdq {{.*#+}} xmm10 = xmm10[2],xmm0[2],xmm10[3],xmm0[3]
8535 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8536 ; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm15[2],xmm0[3],xmm15[3]
8537 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm0[0],xmm10[1]
8538 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
8539 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm5[2,2,2,2]
8540 ; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm0[2],xmm9[3],xmm0[3]
8541 ; SSE-NEXT: movapd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
8542 ; SSE-NEXT: unpckhps (%rsp), %xmm0 # 16-byte Folded Reload
8543 ; SSE-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
8544 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm0[0],xmm9[1]
8545 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,2,2,2]
8546 ; SSE-NEXT: movaps %xmm2, %xmm15
8547 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,2,2,2]
8548 ; SSE-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8549 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
8550 ; SSE-NEXT: movdqa %xmm13, %xmm7
8551 ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm12[2],xmm7[3],xmm12[3]
8552 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm1[2,3]
8553 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8554 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8555 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8556 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
8557 ; SSE-NEXT: # xmm8 = mem[3,3,3,3]
8558 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8559 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8560 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
8561 ; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,1],xmm1[2,3]
8562 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8563 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8564 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8565 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
8566 ; SSE-NEXT: # xmm6 = mem[3,3,3,3]
8567 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8568 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8569 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
8570 ; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
8571 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8572 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8573 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8574 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
8575 ; SSE-NEXT: # xmm5 = mem[3,3,3,3]
8576 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8577 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8578 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
8579 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm1[2,3]
8580 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8581 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8582 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8583 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
8584 ; SSE-NEXT: # xmm4 = mem[3,3,3,3]
8585 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8586 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8587 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
8588 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm1[2,3]
8589 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8590 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8591 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8592 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[3,3,3,3]
8593 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8594 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8595 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
8596 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
8597 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
8598 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8599 ; SSE-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
8600 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
8601 ; SSE-NEXT: # xmm2 = mem[3,3,3,3]
8602 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
8603 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8604 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
8605 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
8606 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8607 ; SSE-NEXT: unpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
8608 ; SSE-NEXT: # xmm12 = xmm12[2],mem[2],xmm12[3],mem[3]
8609 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
8610 ; SSE-NEXT: # xmm1 = mem[3,3,3,3]
8611 ; SSE-NEXT: pshufd $255, (%rsp), %xmm0 # 16-byte Folded Reload
8612 ; SSE-NEXT: # xmm0 = mem[3,3,3,3]
8613 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
8614 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
8615 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8616 ; SSE-NEXT: unpckhps {{.*#+}} xmm12 = xmm12[2],xmm15[2],xmm12[3],xmm15[3]
8617 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,3,3,3]
8618 ; SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
8619 ; SSE-NEXT: # xmm15 = mem[3,3,3,3]
8620 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
8621 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm12[2,3]
8622 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8623 ; SSE-NEXT: movaps %xmm15, 96(%rsi)
8624 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8625 ; SSE-NEXT: movaps %xmm15, 32(%rsi)
8626 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8627 ; SSE-NEXT: movaps %xmm15, 112(%rsi)
8628 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8629 ; SSE-NEXT: movaps %xmm15, 48(%rsi)
8630 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8631 ; SSE-NEXT: movaps %xmm15, 64(%rsi)
8632 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8633 ; SSE-NEXT: movaps %xmm15, (%rsi)
8634 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8635 ; SSE-NEXT: movaps %xmm15, 80(%rsi)
8636 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8637 ; SSE-NEXT: movaps %xmm15, 16(%rsi)
8638 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8639 ; SSE-NEXT: movaps %xmm15, 96(%rdx)
8640 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8641 ; SSE-NEXT: movaps %xmm15, 32(%rdx)
8642 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8643 ; SSE-NEXT: movaps %xmm15, 112(%rdx)
8644 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8645 ; SSE-NEXT: movaps %xmm15, 48(%rdx)
8646 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8647 ; SSE-NEXT: movaps %xmm15, 64(%rdx)
8648 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8649 ; SSE-NEXT: movaps %xmm15, (%rdx)
8650 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8651 ; SSE-NEXT: movaps %xmm15, 80(%rdx)
8652 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8653 ; SSE-NEXT: movaps %xmm15, 16(%rdx)
8654 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8655 ; SSE-NEXT: movaps %xmm15, 96(%rcx)
8656 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8657 ; SSE-NEXT: movaps %xmm15, 32(%rcx)
8658 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8659 ; SSE-NEXT: movaps %xmm15, 112(%rcx)
8660 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8661 ; SSE-NEXT: movaps %xmm15, 48(%rcx)
8662 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8663 ; SSE-NEXT: movaps %xmm15, 64(%rcx)
8664 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8665 ; SSE-NEXT: movaps %xmm15, (%rcx)
8666 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8667 ; SSE-NEXT: movaps %xmm15, 80(%rcx)
8668 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8669 ; SSE-NEXT: movaps %xmm15, 16(%rcx)
8670 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8671 ; SSE-NEXT: movaps %xmm15, 112(%r8)
8672 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8673 ; SSE-NEXT: movaps %xmm15, 96(%r8)
8674 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8675 ; SSE-NEXT: movaps %xmm15, 80(%r8)
8676 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8677 ; SSE-NEXT: movaps %xmm15, 64(%r8)
8678 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8679 ; SSE-NEXT: movaps %xmm15, 48(%r8)
8680 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8681 ; SSE-NEXT: movaps %xmm15, 32(%r8)
8682 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8683 ; SSE-NEXT: movaps %xmm15, 16(%r8)
8684 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8685 ; SSE-NEXT: movaps %xmm15, (%r8)
8686 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8687 ; SSE-NEXT: movaps %xmm15, 112(%r9)
8688 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8689 ; SSE-NEXT: movaps %xmm15, 96(%r9)
8690 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8691 ; SSE-NEXT: movaps %xmm15, 80(%r9)
8692 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8693 ; SSE-NEXT: movaps %xmm15, 64(%r9)
8694 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8695 ; SSE-NEXT: movaps %xmm15, 48(%r9)
8696 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8697 ; SSE-NEXT: movaps %xmm15, 32(%r9)
8698 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8699 ; SSE-NEXT: movaps %xmm15, 16(%r9)
8700 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8701 ; SSE-NEXT: movaps %xmm15, (%r9)
8702 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
8703 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8704 ; SSE-NEXT: movaps %xmm12, 112(%rax)
8705 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8706 ; SSE-NEXT: movaps %xmm12, 96(%rax)
8707 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8708 ; SSE-NEXT: movaps %xmm12, 80(%rax)
8709 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8710 ; SSE-NEXT: movaps %xmm12, 64(%rax)
8711 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8712 ; SSE-NEXT: movaps %xmm12, 48(%rax)
8713 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8714 ; SSE-NEXT: movaps %xmm12, 32(%rax)
8715 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
8716 ; SSE-NEXT: movaps %xmm15, 16(%rax)
8717 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
8718 ; SSE-NEXT: movaps %xmm12, (%rax)
8719 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
8720 ; SSE-NEXT: movapd %xmm9, 112(%rax)
8721 ; SSE-NEXT: movapd %xmm10, 96(%rax)
8722 ; SSE-NEXT: movapd %xmm11, 80(%rax)
8723 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8724 ; SSE-NEXT: movaps %xmm9, 64(%rax)
8725 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8726 ; SSE-NEXT: movaps %xmm9, 48(%rax)
8727 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8728 ; SSE-NEXT: movaps %xmm9, 32(%rax)
8729 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
8730 ; SSE-NEXT: movaps %xmm9, 16(%rax)
8731 ; SSE-NEXT: movaps %xmm7, (%rax)
8732 ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
8733 ; SSE-NEXT: movaps %xmm1, 112(%rax)
8734 ; SSE-NEXT: movaps %xmm2, 96(%rax)
8735 ; SSE-NEXT: movaps %xmm3, 80(%rax)
8736 ; SSE-NEXT: movaps %xmm4, 64(%rax)
8737 ; SSE-NEXT: movaps %xmm5, 48(%rax)
8738 ; SSE-NEXT: movaps %xmm6, 32(%rax)
8739 ; SSE-NEXT: movaps %xmm8, 16(%rax)
8740 ; SSE-NEXT: movaps %xmm0, (%rax)
8741 ; SSE-NEXT: addq $1800, %rsp # imm = 0x708
8742 ; SSE-NEXT: retq
8743 ;
8744 ; AVX-LABEL: load_i16_stride8_vf64:
8745 ; AVX: # %bb.0:
8746 ; AVX-NEXT: subq $2056, %rsp # imm = 0x808
8747 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm0
8748 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8749 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm1
8750 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8751 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8752 ; AVX-NEXT: vmovdqa 272(%rdi), %xmm0
8753 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8754 ; AVX-NEXT: vmovdqa 256(%rdi), %xmm1
8755 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8756 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8757 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm14[0],xmm11[1],xmm14[1]
8758 ; AVX-NEXT: vmovdqa 368(%rdi), %xmm1
8759 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8760 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm2
8761 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8762 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8763 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8764 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8765 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm2
8766 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8767 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm3
8768 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8769 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8770 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,1,0,1]
8771 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
8772 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
8773 ; AVX-NEXT: vmovdqa 496(%rdi), %xmm1
8774 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8775 ; AVX-NEXT: vmovdqa 480(%rdi), %xmm2
8776 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8777 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8778 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[0,0,0,0]
8779 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8780 ; AVX-NEXT: vmovdqa 464(%rdi), %xmm2
8781 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8782 ; AVX-NEXT: vmovdqa 448(%rdi), %xmm3
8783 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8784 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8785 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm9[0,1,0,1]
8786 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
8787 ; AVX-NEXT: vmovdqa 432(%rdi), %xmm2
8788 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8789 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm3
8790 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8791 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8792 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8793 ; AVX-NEXT: vmovdqa 400(%rdi), %xmm2
8794 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8795 ; AVX-NEXT: vmovdqa 384(%rdi), %xmm3
8796 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8797 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8798 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8799 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
8800 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
8801 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8802 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
8803 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8804 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8805 ; AVX-NEXT: vmovdqa 880(%rdi), %xmm0
8806 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8807 ; AVX-NEXT: vmovdqa 864(%rdi), %xmm1
8808 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8809 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8810 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8811 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8812 ; AVX-NEXT: vmovdqa 848(%rdi), %xmm1
8813 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8814 ; AVX-NEXT: vmovdqa 832(%rdi), %xmm2
8815 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8816 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8817 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8818 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
8819 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
8820 ; AVX-NEXT: vmovdqa 816(%rdi), %xmm1
8821 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8822 ; AVX-NEXT: vmovdqa 800(%rdi), %xmm2
8823 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8824 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8825 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8826 ; AVX-NEXT: vmovdqa 784(%rdi), %xmm1
8827 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8828 ; AVX-NEXT: vmovdqa 768(%rdi), %xmm2
8829 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8830 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8831 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
8832 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
8833 ; AVX-NEXT: vmovdqa 1008(%rdi), %xmm1
8834 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8835 ; AVX-NEXT: vmovdqa 992(%rdi), %xmm2
8836 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8837 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8838 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[0,0,0,0]
8839 ; AVX-NEXT: vmovdqa 976(%rdi), %xmm2
8840 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8841 ; AVX-NEXT: vmovdqa 960(%rdi), %xmm3
8842 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8843 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8844 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8845 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
8846 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
8847 ; AVX-NEXT: vmovdqa 944(%rdi), %xmm2
8848 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8849 ; AVX-NEXT: vmovdqa 928(%rdi), %xmm3
8850 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8851 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8852 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8853 ; AVX-NEXT: vmovdqa 912(%rdi), %xmm2
8854 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8855 ; AVX-NEXT: vmovdqa 896(%rdi), %xmm3
8856 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8857 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8858 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8859 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1]
8860 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
8861 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8862 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
8863 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8864 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8865 ; AVX-NEXT: vmovdqa 624(%rdi), %xmm0
8866 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8867 ; AVX-NEXT: vmovdqa 608(%rdi), %xmm1
8868 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8869 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8870 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8871 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8872 ; AVX-NEXT: vmovdqa 592(%rdi), %xmm1
8873 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8874 ; AVX-NEXT: vmovdqa 576(%rdi), %xmm2
8875 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8876 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8877 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8878 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
8879 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
8880 ; AVX-NEXT: vmovdqa 560(%rdi), %xmm1
8881 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8882 ; AVX-NEXT: vmovdqa 544(%rdi), %xmm2
8883 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8884 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8885 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8886 ; AVX-NEXT: vmovdqa 528(%rdi), %xmm1
8887 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8888 ; AVX-NEXT: vmovdqa 512(%rdi), %xmm2
8889 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8890 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8891 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8892 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
8893 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
8894 ; AVX-NEXT: vmovdqa 752(%rdi), %xmm1
8895 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8896 ; AVX-NEXT: vmovdqa 736(%rdi), %xmm2
8897 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8898 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8899 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8900 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
8901 ; AVX-NEXT: vmovdqa 720(%rdi), %xmm2
8902 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8903 ; AVX-NEXT: vmovdqa 704(%rdi), %xmm3
8904 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8905 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8906 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8907 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1]
8908 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
8909 ; AVX-NEXT: vmovdqa 688(%rdi), %xmm2
8910 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8911 ; AVX-NEXT: vmovdqa 672(%rdi), %xmm3
8912 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8913 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8914 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8915 ; AVX-NEXT: vmovdqa 656(%rdi), %xmm2
8916 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8917 ; AVX-NEXT: vmovdqa 640(%rdi), %xmm3
8918 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8919 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
8920 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8921 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
8922 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
8923 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8924 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
8925 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
8926 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8927 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
8928 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8929 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm1
8930 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8931 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8932 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8933 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
8934 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm1
8935 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8936 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm2
8937 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8938 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8939 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
8940 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8941 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
8942 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm1
8943 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8944 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm2
8945 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8946 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8947 ; AVX-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
8948 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm1
8949 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8950 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm2
8951 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8952 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
8953 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8954 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
8955 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
8956 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
8957 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
8958 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm0
8959 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8960 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm1
8961 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8962 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
8963 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8964 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
8965 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm0
8966 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8967 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm2
8968 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8969 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
8970 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8971 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1]
8972 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5],xmm1[6,7]
8973 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
8974 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8975 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
8976 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8977 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm6
8978 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8979 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm7
8980 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8981 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3]
8982 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8983 ; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
8984 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
8985 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8986 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4,5,6,7]
8987 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
8988 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
8989 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8990 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[1,1,1,1]
8991 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8992 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm14[2,3],xmm0[4,5,6,7]
8993 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
8994 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8995 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm5[0],xmm13[1],xmm5[1]
8996 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
8997 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
8998 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm10[0],xmm9[1],xmm10[1]
8999 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9000 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9001 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm10[1,1,1,1]
9002 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9003 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm15[0],xmm9[1],xmm15[2,3]
9004 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9005 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
9006 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9007 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9008 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9009 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[1,1,1,1]
9010 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9011 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7]
9012 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9013 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9014 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
9015 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
9016 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9017 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9018 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm12[0],xmm1[1],xmm12[1]
9019 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9020 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9021 ; AVX-NEXT: # xmm15 = mem[1,1,1,1]
9022 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
9023 ; AVX-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3]
9024 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9025 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
9026 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9027 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9028 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9029 ; AVX-NEXT: # xmm0 = mem[1,1,1,1]
9030 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9031 ; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
9032 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9033 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9034 ; AVX-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1]
9035 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3]
9036 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9037 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9038 ; AVX-NEXT: # xmm3 = xmm1[0],mem[0],xmm1[1],mem[1]
9039 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9040 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9041 ; AVX-NEXT: # xmm15 = mem[1,1,1,1]
9042 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
9043 ; AVX-NEXT: # xmm15 = xmm15[0],mem[1],xmm15[2,3]
9044 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9045 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5],ymm3[6,7]
9046 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9047 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9048 ; AVX-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload
9049 ; AVX-NEXT: # xmm0 = xmm4[0],mem[0],xmm4[1],mem[1]
9050 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9051 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9052 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[1,1,1,1]
9053 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
9054 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm1[1],xmm3[2,3]
9055 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9056 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
9057 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,1,1]
9058 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9059 ; AVX-NEXT: # xmm3 = xmm3[0,1],mem[2,3],xmm3[4,5,6,7]
9060 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9061 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9062 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1]
9063 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm15[4,5,6,7]
9064 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
9065 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9066 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
9067 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2,3,4,5],xmm0[6,7]
9068 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm11[2],xmm14[2],xmm11[3],xmm14[3]
9069 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
9070 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
9071 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9072 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9073 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm9[2,2,2,2]
9074 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9075 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm14[0,1,2],xmm15[3]
9076 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9077 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
9078 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9079 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9080 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
9081 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7]
9082 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9083 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm3[2],xmm8[2],xmm3[3],xmm8[3]
9084 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
9085 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9086 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9087 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm13[2],xmm11[2],xmm13[3],xmm11[3]
9088 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9089 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[2,2,2,2]
9090 ; AVX-NEXT: vpblendw $63, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
9091 ; AVX-NEXT: # xmm15 = mem[0,1,2,3,4,5],xmm15[6,7]
9092 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9093 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
9094 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9095 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9096 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9097 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2]
9098 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9099 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
9100 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9101 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9102 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm10[2],xmm5[2],xmm10[3],xmm5[3]
9103 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
9104 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9105 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9106 ; AVX-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
9107 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9108 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9109 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm12[2,2,2,2]
9110 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9111 ; AVX-NEXT: vblendps {{.*#+}} xmm15 = xmm8[0,1,2],xmm15[3]
9112 ; AVX-NEXT: vinsertf128 $1, %xmm15, %ymm0, %ymm15
9113 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm15[6,7]
9114 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9115 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9116 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm4[2],xmm1[2],xmm4[3],xmm1[3]
9117 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9118 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9119 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm4[2,2,2,2]
9120 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9121 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm1[0,1,2],xmm3[3]
9122 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9123 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
9124 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,2,2]
9125 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9126 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0,1,2,3,4,5],xmm3[6,7]
9127 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9128 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm15 # 16-byte Folded Reload
9129 ; AVX-NEXT: # xmm15 = xmm15[2],mem[2],xmm15[3],mem[3]
9130 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm15[0,1,2,3],xmm3[4,5,6,7]
9131 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
9132 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9133 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9134 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9135 ; AVX-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9136 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9137 ; AVX-NEXT: # xmm3 = mem[2,3,2,3]
9138 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9139 ; AVX-NEXT: # xmm15 = mem[3,3,3,3]
9140 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm15[0],xmm3[1],xmm15[2,3]
9141 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
9142 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm14[2],xmm9[2],xmm14[3],xmm9[3]
9143 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9144 ; AVX-NEXT: # xmm15 = mem[2,3,2,3]
9145 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9146 ; AVX-NEXT: # xmm14 = mem[3,3,3,3]
9147 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm15[1],xmm14[2,3]
9148 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9149 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9150 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7]
9151 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9152 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9153 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9154 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9155 ; AVX-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9156 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9157 ; AVX-NEXT: # xmm3 = mem[2,3,2,3]
9158 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9159 ; AVX-NEXT: # xmm14 = mem[3,3,3,3]
9160 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3]
9161 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
9162 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9163 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9164 ; AVX-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
9165 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm11[2,3,2,3]
9166 ; AVX-NEXT: vshufps {{.*#+}} xmm15 = xmm13[3,3,3,3]
9167 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
9168 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9169 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9170 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm14[0,1,2,3,4,5],ymm3[6,7]
9171 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9172 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9173 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
9174 ; AVX-NEXT: vshufps {{.*#+}} xmm3 = xmm5[2,3,2,3]
9175 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm10[3,3,3,3]
9176 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm14[0],xmm3[1],xmm14[2,3]
9177 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm3[0,1],xmm0[2,3]
9178 ; AVX-NEXT: vunpckhps {{.*#+}} xmm3 = xmm8[2],xmm12[2],xmm8[3],xmm12[3]
9179 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
9180 ; AVX-NEXT: # xmm11 = mem[2,3,2,3]
9181 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
9182 ; AVX-NEXT: # xmm10 = mem[3,3,3,3]
9183 ; AVX-NEXT: vblendps {{.*#+}} xmm10 = xmm10[0],xmm11[1],xmm10[2,3]
9184 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9185 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
9186 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
9187 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
9188 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9189 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
9190 ; AVX-NEXT: vpermilps $238, (%rsp), %xmm3 # 16-byte Folded Reload
9191 ; AVX-NEXT: # xmm3 = mem[2,3,2,3]
9192 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9193 ; AVX-NEXT: # xmm6 = mem[3,3,3,3]
9194 ; AVX-NEXT: vblendps {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3]
9195 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9196 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9197 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
9198 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm3 # 16-byte Folded Reload
9199 ; AVX-NEXT: # xmm3 = xmm2[2],mem[2],xmm2[3],mem[3]
9200 ; AVX-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
9201 ; AVX-NEXT: # xmm2 = mem[2,3,2,3]
9202 ; AVX-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9203 ; AVX-NEXT: # xmm1 = mem[3,3,3,3]
9204 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5,6,7]
9205 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
9206 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9207 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9208 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9209 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
9210 ; AVX-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9211 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9212 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm10 # 16-byte Folded Reload
9213 ; AVX-NEXT: # xmm10 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9214 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9215 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
9216 ; AVX-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9217 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9218 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm7 # 16-byte Folded Reload
9219 ; AVX-NEXT: # xmm7 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9220 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,0,0,0]
9221 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,1,0,1]
9222 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
9223 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm8[0],xmm10[1],xmm8[1]
9224 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9225 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9226 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9227 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9228 ; AVX-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9229 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9230 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9231 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm15 # 16-byte Folded Reload
9232 ; AVX-NEXT: # xmm15 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9233 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9234 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm11 # 16-byte Folded Reload
9235 ; AVX-NEXT: # xmm11 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9236 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9237 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
9238 ; AVX-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9239 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9240 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[0,0,0,0]
9241 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1]
9242 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
9243 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm3[0],xmm15[1],xmm3[1]
9244 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9245 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9246 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
9247 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9248 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9249 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9250 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9251 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9252 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9253 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9254 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
9255 ; AVX-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9256 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9257 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
9258 ; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9259 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9260 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9261 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
9262 ; AVX-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9263 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9264 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
9265 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
9266 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
9267 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
9268 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9269 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9270 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9271 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
9272 ; AVX-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9273 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9274 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9275 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
9276 ; AVX-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9277 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9278 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9279 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
9280 ; AVX-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9281 ; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
9282 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9283 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm5 # 16-byte Folded Reload
9284 ; AVX-NEXT: # xmm5 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
9285 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9286 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
9287 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,1,0,1]
9288 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7]
9289 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
9290 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9291 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9292 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
9293 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9294 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9295 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9296 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9297 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9298 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9299 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9300 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
9301 ; AVX-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9302 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9303 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9304 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
9305 ; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9306 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9307 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9308 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
9309 ; AVX-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9310 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9311 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,0,0,0]
9312 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,1,0,1]
9313 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
9314 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
9315 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9316 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9317 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
9318 ; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9319 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9320 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9321 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
9322 ; AVX-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9323 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9324 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9325 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
9326 ; AVX-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9327 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9328 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9329 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9330 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9331 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm13[0,0,0,0]
9332 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9333 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[0,1,0,1]
9334 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1,2,3,4,5],xmm3[6,7]
9335 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
9336 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
9337 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9338 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
9339 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm3[4,5,6,7]
9340 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9341 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9342 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
9343 ; AVX-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9344 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9345 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9346 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
9347 ; AVX-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9348 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9349 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9350 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9351 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9352 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9353 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9354 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9355 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9356 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
9357 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
9358 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4,5],xmm2[6,7]
9359 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
9360 ; AVX-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9361 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm3
9362 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm2
9363 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6,7]
9364 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9365 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
9366 ; AVX-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9367 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9368 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9369 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9370 ; AVX-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
9371 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9372 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
9373 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
9374 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm2[6,7]
9375 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9376 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
9377 ; AVX-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
9378 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9379 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9380 ; AVX-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
9381 ; AVX-NEXT: # xmm3 = xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7]
9382 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9383 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
9384 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2,3],xmm0[4,5,6,7]
9385 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9386 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9387 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9388 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[1,1,1,1]
9389 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm8[2,3],xmm0[4,5,6,7]
9390 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9391 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9392 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm9[0],xmm7[1],xmm9[1]
9393 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
9394 ; AVX-NEXT: vmovdqa %xmm11, %xmm8
9395 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9396 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9397 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm11[0],xmm8[0],xmm11[1],xmm8[1]
9398 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9399 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9400 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[1,1,1,1]
9401 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9402 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm14[0,1],xmm3[2,3],xmm14[4,5,6,7]
9403 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9404 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
9405 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9406 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9407 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[1,1,1,1]
9408 ; AVX-NEXT: vpblendw $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9409 ; AVX-NEXT: # xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6,7]
9410 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9411 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9412 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
9413 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
9414 ; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload
9415 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9416 ; AVX-NEXT: vunpcklps {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
9417 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9418 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9419 ; AVX-NEXT: # xmm14 = mem[1,1,1,1]
9420 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9421 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm14[0],xmm2[1],xmm14[2,3]
9422 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9423 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
9424 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9425 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9426 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9427 ; AVX-NEXT: # xmm0 = mem[1,1,1,1]
9428 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9429 ; AVX-NEXT: # xmm0 = xmm0[0],mem[1],xmm0[2,3]
9430 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9431 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9432 ; AVX-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
9433 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
9434 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9435 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm13[0],xmm1[1],xmm13[1]
9436 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9437 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9438 ; AVX-NEXT: # xmm14 = mem[1,1,1,1]
9439 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9440 ; AVX-NEXT: # xmm14 = xmm14[0],mem[1],xmm14[2,3]
9441 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9442 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
9443 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9444 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9445 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9446 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9447 ; AVX-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
9448 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9449 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[1,1,1,1]
9450 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9451 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3],xmm1[4,5,6,7]
9452 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9453 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
9454 ; AVX-NEXT: vpermilps $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9455 ; AVX-NEXT: # xmm1 = mem[1,1,1,1]
9456 ; AVX-NEXT: vblendps $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9457 ; AVX-NEXT: # xmm1 = xmm1[0],mem[1],xmm1[2,3]
9458 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9459 ; AVX-NEXT: vunpcklps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9460 ; AVX-NEXT: # xmm14 = xmm14[0],mem[0],xmm14[1],mem[1]
9461 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
9462 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
9463 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9464 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[2,2,2,2]
9465 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0,1,2,3,4,5],xmm0[6,7]
9466 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm1 # 16-byte Folded Reload
9467 ; AVX-NEXT: # xmm1 = xmm10[2],mem[2],xmm10[3],mem[3]
9468 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9469 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm15[2],xmm3[2],xmm15[3],xmm3[3]
9470 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9471 ; AVX-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,2,2,2]
9472 ; AVX-NEXT: vpblendw {{.*#+}} xmm14 = xmm11[0,1,2,3,4,5],xmm14[6,7]
9473 ; AVX-NEXT: vmovdqa %xmm11, %xmm15
9474 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9475 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7]
9476 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9477 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9478 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
9479 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3,4,5],xmm0[6,7]
9480 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9481 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9482 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9483 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
9484 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9485 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9486 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9487 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2]
9488 ; AVX-NEXT: vblendps $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm14 # 16-byte Folded Reload
9489 ; AVX-NEXT: # xmm14 = mem[0,1,2],xmm14[3]
9490 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9491 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7]
9492 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9493 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9494 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9495 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm7[2,2,2,2]
9496 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9497 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
9498 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9499 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9500 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm8[2],xmm4[2],xmm8[3],xmm4[3]
9501 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9502 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9503 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9504 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm10[2],xmm9[2],xmm10[3],xmm9[3]
9505 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9506 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9507 ; AVX-NEXT: vshufps {{.*#+}} xmm14 = xmm6[2,2,2,2]
9508 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
9509 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm3[0,1,2],xmm14[3]
9510 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9511 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm14[6,7]
9512 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9513 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9514 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9515 ; AVX-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
9516 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
9517 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm13[2,2,2,2]
9518 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9519 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm11[0,1,2],xmm1[3]
9520 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9521 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5],ymm1[6,7]
9522 ; AVX-NEXT: vpermilps $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9523 ; AVX-NEXT: # xmm1 = mem[2,2,2,2]
9524 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
9525 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm13[0,1,2],xmm1[3]
9526 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9527 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9528 ; AVX-NEXT: vunpckhps {{.*#+}} xmm14 = xmm0[2],xmm12[2],xmm0[3],xmm12[3]
9529 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
9530 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm2[4,5,6,7]
9531 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9532 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
9533 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
9534 ; AVX-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
9535 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
9536 ; AVX-NEXT: # xmm1 = mem[2,3,2,3]
9537 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9538 ; AVX-NEXT: # xmm14 = mem[3,3,3,3]
9539 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm14[0],xmm1[1],xmm14[2,3]
9540 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9541 ; AVX-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
9542 ; AVX-NEXT: # xmm1 = xmm15[2],mem[2],xmm15[3],mem[3]
9543 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
9544 ; AVX-NEXT: # xmm14 = mem[2,3,2,3]
9545 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
9546 ; AVX-NEXT: # xmm15 = mem[3,3,3,3]
9547 ; AVX-NEXT: vblendps {{.*#+}} xmm14 = xmm15[0],xmm14[1],xmm15[2,3]
9548 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9549 ; AVX-NEXT: vinsertf128 $1, %xmm14, %ymm0, %ymm14
9550 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5],ymm1[6,7]
9551 ; AVX-NEXT: vblendps {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9552 ; AVX-NEXT: vunpckhps {{.*#+}} xmm0 = xmm5[2],xmm7[2],xmm5[3],xmm7[3]
9553 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm4[2,3,2,3]
9554 ; AVX-NEXT: vshufps {{.*#+}} xmm7 = xmm8[3,3,3,3]
9555 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm7[0],xmm1[1],xmm7[2,3]
9556 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9557 ; AVX-NEXT: vunpckhps {{.*#+}} xmm1 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
9558 ; AVX-NEXT: vshufps {{.*#+}} xmm4 = xmm9[2,3,2,3]
9559 ; AVX-NEXT: vshufps {{.*#+}} xmm6 = xmm10[3,3,3,3]
9560 ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
9561 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
9562 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
9563 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
9564 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9565 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9566 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
9567 ; AVX-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
9568 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
9569 ; AVX-NEXT: # xmm4 = mem[2,3,2,3]
9570 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9571 ; AVX-NEXT: # xmm6 = mem[3,3,3,3]
9572 ; AVX-NEXT: vblendps {{.*#+}} xmm4 = xmm6[0],xmm4[1],xmm6[2,3]
9573 ; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm4[0,1],xmm1[2,3]
9574 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
9575 ; AVX-NEXT: vunpckhps (%rsp), %xmm2, %xmm4 # 16-byte Folded Reload
9576 ; AVX-NEXT: # xmm4 = xmm2[2],mem[2],xmm2[3],mem[3]
9577 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9578 ; AVX-NEXT: # xmm6 = mem[2,3,2,3]
9579 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9580 ; AVX-NEXT: # xmm7 = mem[3,3,3,3]
9581 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
9582 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
9583 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
9584 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
9585 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
9586 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm4 # 16-byte Folded Reload
9587 ; AVX-NEXT: # xmm4 = xmm11[2],mem[2],xmm11[3],mem[3]
9588 ; AVX-NEXT: vpermilps $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
9589 ; AVX-NEXT: # xmm6 = mem[2,3,2,3]
9590 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
9591 ; AVX-NEXT: # xmm7 = mem[3,3,3,3]
9592 ; AVX-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2,3]
9593 ; AVX-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm4
9594 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
9595 ; AVX-NEXT: vblendps {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
9596 ; AVX-NEXT: vunpckhps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm5 # 16-byte Folded Reload
9597 ; AVX-NEXT: # xmm5 = xmm13[2],mem[2],xmm13[3],mem[3]
9598 ; AVX-NEXT: vshufps {{.*#+}} xmm2 = xmm12[2,3,2,3]
9599 ; AVX-NEXT: vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
9600 ; AVX-NEXT: # xmm3 = mem[3,3,3,3]
9601 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
9602 ; AVX-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3]
9603 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7]
9604 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9605 ; AVX-NEXT: vmovaps %ymm3, 64(%rsi)
9606 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9607 ; AVX-NEXT: vmovaps %ymm3, (%rsi)
9608 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9609 ; AVX-NEXT: vmovaps %ymm3, 96(%rsi)
9610 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9611 ; AVX-NEXT: vmovaps %ymm3, 32(%rsi)
9612 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9613 ; AVX-NEXT: vmovaps %ymm3, 64(%rdx)
9614 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9615 ; AVX-NEXT: vmovaps %ymm3, (%rdx)
9616 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9617 ; AVX-NEXT: vmovaps %ymm3, 96(%rdx)
9618 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9619 ; AVX-NEXT: vmovaps %ymm3, 32(%rdx)
9620 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9621 ; AVX-NEXT: vmovaps %ymm3, 64(%rcx)
9622 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9623 ; AVX-NEXT: vmovaps %ymm3, (%rcx)
9624 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9625 ; AVX-NEXT: vmovaps %ymm3, 96(%rcx)
9626 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9627 ; AVX-NEXT: vmovaps %ymm3, 32(%rcx)
9628 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9629 ; AVX-NEXT: vmovaps %ymm3, 64(%r8)
9630 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9631 ; AVX-NEXT: vmovaps %ymm3, (%r8)
9632 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9633 ; AVX-NEXT: vmovaps %ymm3, 96(%r8)
9634 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9635 ; AVX-NEXT: vmovaps %ymm3, 32(%r8)
9636 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9637 ; AVX-NEXT: vmovaps %ymm3, 64(%r9)
9638 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9639 ; AVX-NEXT: vmovaps %ymm3, (%r9)
9640 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9641 ; AVX-NEXT: vmovaps %ymm3, 96(%r9)
9642 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9643 ; AVX-NEXT: vmovaps %ymm3, 32(%r9)
9644 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
9645 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9646 ; AVX-NEXT: vmovaps %ymm3, 64(%rax)
9647 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9648 ; AVX-NEXT: vmovaps %ymm3, (%rax)
9649 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9650 ; AVX-NEXT: vmovaps %ymm3, 96(%rax)
9651 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9652 ; AVX-NEXT: vmovaps %ymm3, 32(%rax)
9653 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
9654 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9655 ; AVX-NEXT: vmovaps %ymm3, 64(%rax)
9656 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9657 ; AVX-NEXT: vmovaps %ymm3, (%rax)
9658 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9659 ; AVX-NEXT: vmovaps %ymm3, 96(%rax)
9660 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
9661 ; AVX-NEXT: vmovaps %ymm3, 32(%rax)
9662 ; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovaps %ymm1, 96(%rax)
; AVX-NEXT: vmovaps %ymm0, 64(%rax)
; AVX-NEXT: vmovaps %ymm14, 32(%rax)
; AVX-NEXT: vmovaps %ymm2, (%rax)
; AVX-NEXT: addq $2056, %rsp # imm = 0x808
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
;
; AVX2-LABEL: load_i16_stride8_vf64:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $2408, %rsp # imm = 0x968
9674 ; AVX2-NEXT: vmovdqa 448(%rdi), %ymm2
9675 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9676 ; AVX2-NEXT: vmovdqa 480(%rdi), %ymm3
9677 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9678 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm0
9679 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9680 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm1
9681 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9682 ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm4
9683 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9684 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm5
9685 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9686 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
9687 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9688 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9689 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9690 ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm0
9691 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9692 ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm1
9693 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9694 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9695 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9696 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
9697 ; AVX2-NEXT: vmovdqa 336(%rdi), %xmm1
9698 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9699 ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm4
9700 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9701 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
9702 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9703 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
9704 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9705 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
9706 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9707 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
9708 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9709 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
9710 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9711 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
9712 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
9713 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9714 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
9715 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9716 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
9717 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
9718 ; AVX2-NEXT: vmovdqa 384(%rdi), %ymm3
9719 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9720 ; AVX2-NEXT: vmovdqa 416(%rdi), %ymm2
9721 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9722 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
9723 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9724 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
9725 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9726 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
9727 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
9728 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9729 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
9730 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9731 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9732 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
9733 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
9734 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9735 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9736 ; AVX2-NEXT: vmovdqa 880(%rdi), %xmm0
9737 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9738 ; AVX2-NEXT: vmovdqa 864(%rdi), %xmm1
9739 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9740 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9741 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9742 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
9743 ; AVX2-NEXT: vmovdqa 848(%rdi), %xmm1
9744 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9745 ; AVX2-NEXT: vmovdqa 832(%rdi), %xmm2
9746 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9747 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9748 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9749 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
9750 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9751 ; AVX2-NEXT: vmovdqa 784(%rdi), %xmm1
9752 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9753 ; AVX2-NEXT: vmovdqa 768(%rdi), %xmm2
9754 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9755 ; AVX2-NEXT: vmovdqa 816(%rdi), %xmm3
9756 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9757 ; AVX2-NEXT: vmovdqa 800(%rdi), %xmm4
9758 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9759 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
9760 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9761 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9762 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9763 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
9764 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9765 ; AVX2-NEXT: vmovdqa 960(%rdi), %ymm2
9766 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9767 ; AVX2-NEXT: vmovdqa 992(%rdi), %ymm1
9768 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9769 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
9770 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9771 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
9772 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
9773 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
9774 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9775 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
9776 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
9777 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
9778 ; AVX2-NEXT: vmovdqa 896(%rdi), %ymm3
9779 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9780 ; AVX2-NEXT: vmovdqa 928(%rdi), %ymm2
9781 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9782 ; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
9783 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9784 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
9785 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
9786 ; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
9787 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9788 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7]
9789 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9790 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
9791 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
9792 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9793 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9794 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm0
9795 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9796 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm1
9797 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9798 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
9799 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9800 ; AVX2-NEXT: vpbroadcastd %xmm0, %xmm0
9801 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm1
9802 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9803 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
9804 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9805 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
9806 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9807 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
9808 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
9809 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
9810 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9811 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
9812 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9813 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
9814 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9815 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm4
9816 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9817 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
9818 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9819 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
9820 ; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
9821 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
9822 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
9823 ; AVX2-NEXT: vmovdqa 192(%rdi), %ymm1
9824 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9825 ; AVX2-NEXT: vmovdqa 224(%rdi), %ymm0
9826 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9827 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
9828 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9829 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
9830 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
9831 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
9832 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9833 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
9834 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
9835 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7]
9836 ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm0
9837 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9838 ; AVX2-NEXT: vmovdqa 160(%rdi), %ymm1
9839 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9840 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
9841 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9842 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
9843 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
9844 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
9845 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9846 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
9847 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9848 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
9849 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
9850 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
9851 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9852 ; AVX2-NEXT: vmovdqa 624(%rdi), %xmm4
9853 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9854 ; AVX2-NEXT: vmovdqa 608(%rdi), %xmm5
9855 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9856 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
9857 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9858 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm4
9859 ; AVX2-NEXT: vmovdqa 592(%rdi), %xmm5
9860 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9861 ; AVX2-NEXT: vmovdqa 576(%rdi), %xmm8
9862 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9863 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
9864 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9865 ; AVX2-NEXT: vpbroadcastd %xmm5, %xmm5
9866 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9867 ; AVX2-NEXT: vmovdqa 528(%rdi), %xmm5
9868 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9869 ; AVX2-NEXT: vmovdqa 512(%rdi), %xmm8
9870 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9871 ; AVX2-NEXT: vmovdqa 560(%rdi), %xmm9
9872 ; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9873 ; AVX2-NEXT: vmovdqa 544(%rdi), %xmm12
9874 ; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9875 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
9876 ; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9877 ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
9878 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9879 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
9880 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm5[0,1],xmm4[2,3]
9881 ; AVX2-NEXT: vmovdqa 704(%rdi), %ymm5
9882 ; AVX2-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9883 ; AVX2-NEXT: vmovdqa 736(%rdi), %ymm4
9884 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9885 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,2]
9886 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9887 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
9888 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9889 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
9890 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,0,2]
9891 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9892 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
9893 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9894 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
9895 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7]
9896 ; AVX2-NEXT: vmovdqa 640(%rdi), %ymm9
9897 ; AVX2-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9898 ; AVX2-NEXT: vmovdqa 672(%rdi), %ymm8
9899 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9900 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,2]
9901 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9902 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
9903 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9904 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
9905 ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm9[0,1,0,2]
9906 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9907 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
9908 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9909 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
9910 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
9911 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
9912 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
9913 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9914 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
9915 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,1,1]
9916 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
9917 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm8[1],xmm12[2,3]
9918 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
9919 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
9920 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
9921 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
9922 ; AVX2-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
9923 ; AVX2-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
9924 ; AVX2-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
9925 ; AVX2-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
9926 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
9927 ; AVX2-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
9928 ; AVX2-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9929 ; AVX2-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
9930 ; AVX2-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
9931 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
9932 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
9933 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
9934 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9935 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
9936 ; AVX2-NEXT: # xmm12 = mem[1,1,1,1]
9937 ; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
9938 ; AVX2-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
9939 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
9940 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
9941 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
9942 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
9943 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
9944 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
9945 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
9946 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9947 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
9948 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
9949 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
9950 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
9951 ; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9952 ; AVX2-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload
9953 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1]
9954 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
9955 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3]
9956 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
9957 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
9958 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
9959 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
9960 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
9961 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
9962 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
9963 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9964 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
9965 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
9966 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
9967 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
9968 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9969 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
9970 ; AVX2-NEXT: # xmm0 = mem[1,1,1,1]
9971 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
9972 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
9973 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
9974 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
9975 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
9976 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
9977 ; AVX2-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
9978 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
9979 ; AVX2-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
9980 ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
9981 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
9982 ; AVX2-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
9983 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
9984 ; AVX2-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
9985 ; AVX2-NEXT: # ymm3 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
9986 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
9987 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
9988 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
9989 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9990 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
9991 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
9992 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
9993 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
9994 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
9995 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
9996 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
9997 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
9998 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
9999 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10000 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10001 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10002 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10003 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10004 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10005 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10006 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10007 ; AVX2-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
10008 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10009 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10010 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10011 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10012 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10013 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10014 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10015 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
10016 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
10017 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10018 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10019 ; AVX2-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
10020 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10021 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10022 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
10023 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10024 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10025 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
10026 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10027 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10028 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10029 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10030 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10031 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10032 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10033 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10034 ; AVX2-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
10035 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10036 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10037 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10038 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10039 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10040 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10041 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10042 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
10043 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
10044 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
10045 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10046 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10047 ; AVX2-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
10048 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
10049 ; AVX2-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
10050 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10051 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10052 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10053 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
10054 ; AVX2-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
10055 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
10056 ; AVX2-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
10057 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10058 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10059 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10060 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10061 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10062 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10063 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
10064 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10065 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
10066 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10067 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
10068 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
10069 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
10070 ; AVX2-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
10071 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
10072 ; AVX2-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
10073 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10074 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10075 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10076 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10077 ; AVX2-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
10078 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10079 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10080 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10081 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10082 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
10083 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10084 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
10085 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10086 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10087 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10088 ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
10089 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10090 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
10091 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10092 ; AVX2-NEXT: # xmm3 = mem[3,3,3,3]
10093 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
10094 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10095 ; AVX2-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10096 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10097 ; AVX2-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10098 ; AVX2-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10099 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
10100 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10101 ; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10102 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10103 ; AVX2-NEXT: # ymm15 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10104 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
10105 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
10106 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10107 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10108 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10109 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10110 ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
10111 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10112 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
10113 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10114 ; AVX2-NEXT: # xmm3 = mem[3,3,3,3]
10115 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
10116 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10117 ; AVX2-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10118 ; AVX2-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10119 ; AVX2-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10120 ; AVX2-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10121 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
10122 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10123 ; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10124 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
10125 ; AVX2-NEXT: # ymm11 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10126 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7]
10127 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
10128 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10129 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10130 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10131 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10132 ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
10133 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10134 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
10135 ; AVX2-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload
10136 ; AVX2-NEXT: # xmm3 = mem[3,3,3,3]
10137 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
10138 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10139 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10140 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10141 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
10142 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10143 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10144 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
10145 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
10146 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10147 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10148 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload
10149 ; AVX2-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3]
10150 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
10151 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
10152 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
10153 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
10154 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10155 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10156 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10157 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
10158 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10159 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10160 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
10161 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10162 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10163 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10164 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10165 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10166 ; AVX2-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10167 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10168 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10169 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10170 ; AVX2-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10171 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10172 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10173 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
10174 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10175 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10176 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10177 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
10178 ; AVX2-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10179 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10180 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
10181 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm1
10182 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10183 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10184 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10185 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10186 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
10187 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10188 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10189 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10190 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10191 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
10192 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10193 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10194 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
10195 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10196 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10197 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10198 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10199 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10200 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10201 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10202 ; AVX2-NEXT: # ymm3 = mem[0,1,1,3]
10203 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10204 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
10205 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10206 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10207 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
10208 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10209 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10210 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10211 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10212 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10213 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10214 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10215 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10216 ; AVX2-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10217 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10218 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10219 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10220 ; AVX2-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10221 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10222 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10223 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
10224 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10225 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10226 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10227 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
10228 ; AVX2-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10229 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10230 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
10231 ; AVX2-NEXT: vpbroadcastd %xmm4, %xmm1
10232 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10233 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10234 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10235 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10236 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
10237 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10238 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10239 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10240 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10241 ; AVX2-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
10242 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10243 ; AVX2-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
10244 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10245 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10246 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10247 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10248 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10249 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10250 ; AVX2-NEXT: # ymm3 = mem[0,1,1,3]
10251 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10252 ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
10253 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10254 ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
10255 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10256 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10257 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10258 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10259 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10260 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10261 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
10262 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10263 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10264 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10265 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10266 ; AVX2-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10267 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10268 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
10269 ; AVX2-NEXT: vpbroadcastd %xmm2, %xmm1
10270 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10271 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10272 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
10273 ; AVX2-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
10274 ; AVX2-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
10275 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10276 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10277 ; AVX2-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
10278 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10279 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
10280 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10281 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10282 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
10283 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10284 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10285 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10286 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10287 ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4]
10288 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10289 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4]
10290 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10291 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10292 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10293 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3]
10294 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10295 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10296 ; AVX2-NEXT: # ymm3 = mem[0,1,1,3]
10297 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10298 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7]
10299 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10300 ; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
10301 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10302 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10303 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10304 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10305 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10306 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10307 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
10308 ; AVX2-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10309 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10310 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10311 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
10312 ; AVX2-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10313 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10314 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10315 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
10316 ; AVX2-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10317 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10318 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10319 ; AVX2-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
10320 ; AVX2-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
10321 ; AVX2-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10322 ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm0
10323 ; AVX2-NEXT: vpbroadcastd %xmm12, %xmm1
10324 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10325 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
10326 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
10327 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
10328 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3]
10329 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10330 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10331 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3]
10332 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10333 ; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
10334 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10335 ; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
10336 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10337 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7]
10338 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
10339 ; AVX2-NEXT: # ymm0 = mem[0,1,1,3]
10340 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10341 ; AVX2-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10342 ; AVX2-NEXT: # ymm15 = mem[0,1,1,3]
10343 ; AVX2-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10344 ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
10345 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10346 ; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7]
10347 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10348 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
10349 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
10350 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
10351 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10352 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
10353 ; AVX2-NEXT: # xmm12 = mem[1,1,1,1]
10354 ; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10355 ; AVX2-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
10356 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10357 ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
10358 ; AVX2-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
10359 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
10360 ; AVX2-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
10361 ; AVX2-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10362 ; AVX2-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10363 ; AVX2-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10364 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
10365 ; AVX2-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10366 ; AVX2-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10367 ; AVX2-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10368 ; AVX2-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10369 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
10370 ; AVX2-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
10371 ; AVX2-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
10372 ; AVX2-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10373 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10374 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1]
10375 ; AVX2-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10376 ; AVX2-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
10377 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10378 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10379 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
10380 ; AVX2-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
10381 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10382 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10383 ; AVX2-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
10384 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10385 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10386 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
10387 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
10388 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
10389 ; AVX2-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10390 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10391 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,1,1]
10392 ; AVX2-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload
10393 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3]
10394 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10395 ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm9 # 16-byte Folded Reload
10396 ; AVX2-NEXT: # xmm9 = xmm11[0],mem[0],xmm11[1],mem[1]
10397 ; AVX2-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
10398 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10399 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10400 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
10401 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10402 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10403 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
10404 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
10405 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
10406 ; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10407 ; AVX2-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10408 ; AVX2-NEXT: # xmm4 = mem[1,1,1,1]
10409 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
10410 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3]
10411 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10412 ; AVX2-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
10413 ; AVX2-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
10414 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
10415 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10416 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10417 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
10418 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10419 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10420 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
10421 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
10422 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
10423 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10424 ; AVX2-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10425 ; AVX2-NEXT: # xmm0 = mem[2,2,2,2]
10426 ; AVX2-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10427 ; AVX2-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
10428 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10429 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
10430 ; AVX2-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
10431 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10432 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10433 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
10434 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10435 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10436 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
10437 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10438 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10439 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10440 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10441 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10442 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10443 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10444 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10445 ; AVX2-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
10446 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10447 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10448 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10449 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10450 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10451 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10452 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10453 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
10454 ; AVX2-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10455 ; AVX2-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
10456 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
10457 ; AVX2-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3]
10458 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10459 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10460 ; AVX2-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
10461 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10462 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10463 ; AVX2-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
10464 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10465 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10466 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10467 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10468 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10469 ; AVX2-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10470 ; AVX2-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10471 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10472 ; AVX2-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
10473 ; AVX2-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10474 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10475 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10476 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10477 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10478 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10479 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10480 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10481 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
10482 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
10483 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
10484 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
10485 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
10486 ; AVX2-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
10487 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
10488 ; AVX2-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
10489 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10490 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10491 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm0[7]
10492 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10493 ; AVX2-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
10494 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10495 ; AVX2-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
10496 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10497 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10498 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
10499 ; AVX2-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
10500 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7]
10501 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10502 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10503 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,2,2]
10504 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10505 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
10506 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10507 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm10[2],xmm13[2],xmm10[3],xmm13[3]
10508 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm6[2,3]
10509 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
10510 ; AVX2-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
10511 ; AVX2-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
10512 ; AVX2-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
10513 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10514 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10515 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm6[7]
10516 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
10517 ; AVX2-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
10518 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
10519 ; AVX2-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
10520 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
10521 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
10522 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
10523 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
10524 ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7]
10525 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10526 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
10527 ; AVX2-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload
10528 ; AVX2-NEXT: # xmm1 = mem[2,3,2,3]
10529 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10530 ; AVX2-NEXT: # xmm2 = mem[3,3,3,3]
10531 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
10532 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10533 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10534 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10535 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10536 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10537 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10538 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10539 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10540 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10541 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
10542 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
10543 ; AVX2-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
10544 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
10545 ; AVX2-NEXT: # xmm2 = mem[2,3,2,3]
10546 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10547 ; AVX2-NEXT: # xmm3 = mem[3,3,3,3]
10548 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
10549 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
10550 ; AVX2-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10551 ; AVX2-NEXT: # ymm2 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10552 ; AVX2-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10553 ; AVX2-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10554 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
10555 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10556 ; AVX2-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10557 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10558 ; AVX2-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10559 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
10560 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
10561 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
10562 ; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
10563 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
10564 ; AVX2-NEXT: # xmm3 = mem[2,3,2,3]
10565 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,3,3,3]
10566 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3]
10567 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
10568 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10569 ; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10570 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
10571 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10572 ; AVX2-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10573 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
10574 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
10575 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
10576 ; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
10577 ; AVX2-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
10578 ; AVX2-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
10579 ; AVX2-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
10580 ; AVX2-NEXT: # xmm4 = mem[2,3,2,3]
10581 ; AVX2-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
10582 ; AVX2-NEXT: # xmm5 = mem[3,3,3,3]
10583 ; AVX2-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
10584 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
10585 ; AVX2-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
10586 ; AVX2-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
10587 ; AVX2-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
10588 ; AVX2-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
10589 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
10590 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
10591 ; AVX2-NEXT: # ymm5 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
10592 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
10593 ; AVX2-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
10594 ; AVX2-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
10595 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
10596 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
10597 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10598 ; AVX2-NEXT: vmovaps %ymm4, 64(%rsi)
10599 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10600 ; AVX2-NEXT: vmovaps %ymm4, (%rsi)
10601 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10602 ; AVX2-NEXT: vmovaps %ymm4, 96(%rsi)
10603 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10604 ; AVX2-NEXT: vmovaps %ymm4, 32(%rsi)
10605 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10606 ; AVX2-NEXT: vmovaps %ymm4, 64(%rdx)
10607 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10608 ; AVX2-NEXT: vmovaps %ymm4, (%rdx)
10609 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10610 ; AVX2-NEXT: vmovaps %ymm4, 96(%rdx)
10611 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10612 ; AVX2-NEXT: vmovaps %ymm4, 32(%rdx)
10613 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10614 ; AVX2-NEXT: vmovaps %ymm4, 64(%rcx)
10615 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10616 ; AVX2-NEXT: vmovaps %ymm4, (%rcx)
10617 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10618 ; AVX2-NEXT: vmovaps %ymm4, 96(%rcx)
10619 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10620 ; AVX2-NEXT: vmovaps %ymm4, 32(%rcx)
10621 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10622 ; AVX2-NEXT: vmovaps %ymm4, 64(%r8)
10623 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10624 ; AVX2-NEXT: vmovaps %ymm4, (%r8)
10625 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10626 ; AVX2-NEXT: vmovaps %ymm4, 96(%r8)
10627 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10628 ; AVX2-NEXT: vmovaps %ymm4, 32(%r8)
10629 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10630 ; AVX2-NEXT: vmovaps %ymm4, 64(%r9)
10631 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10632 ; AVX2-NEXT: vmovaps %ymm4, (%r9)
10633 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10634 ; AVX2-NEXT: vmovaps %ymm4, 96(%r9)
10635 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10636 ; AVX2-NEXT: vmovaps %ymm4, 32(%r9)
10637 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
10638 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10639 ; AVX2-NEXT: vmovaps %ymm4, 64(%rax)
10640 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10641 ; AVX2-NEXT: vmovaps %ymm4, (%rax)
10642 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10643 ; AVX2-NEXT: vmovaps %ymm4, 96(%rax)
10644 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10645 ; AVX2-NEXT: vmovaps %ymm4, 32(%rax)
10646 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
10647 ; AVX2-NEXT: vmovdqa %ymm15, 64(%rax)
10648 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10649 ; AVX2-NEXT: vmovaps %ymm4, (%rax)
10650 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10651 ; AVX2-NEXT: vmovaps %ymm4, 96(%rax)
10652 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
10653 ; AVX2-NEXT: vmovaps %ymm4, 32(%rax)
10654 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
10655 ; AVX2-NEXT: vmovdqa %ymm3, 96(%rax)
10656 ; AVX2-NEXT: vmovdqa %ymm2, 64(%rax)
10657 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rax)
10658 ; AVX2-NEXT: vmovdqa %ymm1, (%rax)
10659 ; AVX2-NEXT: addq $2408, %rsp # imm = 0x968
10660 ; AVX2-NEXT: vzeroupper
10661 ; AVX2-NEXT: retq
10662 ;
10663 ; AVX2-FP-LABEL: load_i16_stride8_vf64:
10664 ; AVX2-FP: # %bb.0:
10665 ; AVX2-FP-NEXT: subq $2408, %rsp # imm = 0x968
10666 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %ymm2
10667 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10668 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %ymm3
10669 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10670 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm0
10671 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10672 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm1
10673 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10674 ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm4
10675 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10676 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm5
10677 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10678 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10679 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10680 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10681 ; AVX2-FP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10682 ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm0
10683 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10684 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm1
10685 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10686 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10687 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10688 ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %xmm0
10689 ; AVX2-FP-NEXT: vmovdqa 336(%rdi), %xmm1
10690 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10691 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm4
10692 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10693 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
10694 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10695 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm1
10696 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10697 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
10698 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10699 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
10700 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10701 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
10702 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10703 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10704 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
10705 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10706 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
10707 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10708 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10709 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10710 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %ymm3
10711 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10712 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %ymm2
10713 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10714 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
10715 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10716 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
10717 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10718 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10719 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
10720 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10721 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
10722 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10723 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10724 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10725 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10726 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10727 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10728 ; AVX2-FP-NEXT: vmovdqa 880(%rdi), %xmm0
10729 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10730 ; AVX2-FP-NEXT: vmovdqa 864(%rdi), %xmm1
10731 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10732 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10733 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10734 ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %xmm0
10735 ; AVX2-FP-NEXT: vmovdqa 848(%rdi), %xmm1
10736 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10737 ; AVX2-FP-NEXT: vmovdqa 832(%rdi), %xmm2
10738 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10739 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10740 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10741 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm1
10742 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10743 ; AVX2-FP-NEXT: vmovdqa 784(%rdi), %xmm1
10744 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10745 ; AVX2-FP-NEXT: vmovdqa 768(%rdi), %xmm2
10746 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10747 ; AVX2-FP-NEXT: vmovdqa 816(%rdi), %xmm3
10748 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10749 ; AVX2-FP-NEXT: vmovdqa 800(%rdi), %xmm4
10750 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10751 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
10752 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10753 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10754 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10755 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
10756 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10757 ; AVX2-FP-NEXT: vmovdqa 960(%rdi), %ymm2
10758 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10759 ; AVX2-FP-NEXT: vmovdqa 992(%rdi), %ymm1
10760 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10761 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
10762 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10763 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
10764 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10765 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
10766 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10767 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
10768 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10769 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10770 ; AVX2-FP-NEXT: vmovdqa 896(%rdi), %ymm3
10771 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10772 ; AVX2-FP-NEXT: vmovdqa 928(%rdi), %ymm2
10773 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10774 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
10775 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10776 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
10777 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10778 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
10779 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10780 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7]
10781 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10782 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10783 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10784 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10785 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10786 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm0
10787 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10788 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm1
10789 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10790 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
10791 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10792 ; AVX2-FP-NEXT: vpbroadcastd %xmm0, %xmm0
10793 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm1
10794 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10795 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm2
10796 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10797 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
10798 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10799 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm1
10800 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
10801 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
10802 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10803 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2
10804 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10805 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3
10806 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10807 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm4
10808 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10809 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
10810 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10811 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
10812 ; AVX2-FP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
10813 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
10814 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
10815 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %ymm1
10816 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10817 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %ymm0
10818 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10819 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
10820 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10821 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
10822 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10823 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
10824 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10825 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
10826 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10827 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7]
10828 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %ymm0
10829 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10830 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %ymm1
10831 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10832 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
10833 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10834 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
10835 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10836 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
10837 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10838 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
10839 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10840 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
10841 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
10842 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
10843 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10844 ; AVX2-FP-NEXT: vmovdqa 624(%rdi), %xmm4
10845 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10846 ; AVX2-FP-NEXT: vmovdqa 608(%rdi), %xmm5
10847 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10848 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
10849 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10850 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm4
10851 ; AVX2-FP-NEXT: vmovdqa 592(%rdi), %xmm5
10852 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10853 ; AVX2-FP-NEXT: vmovdqa 576(%rdi), %xmm8
10854 ; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10855 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
10856 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10857 ; AVX2-FP-NEXT: vpbroadcastd %xmm5, %xmm5
10858 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
10859 ; AVX2-FP-NEXT: vmovdqa 528(%rdi), %xmm5
10860 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10861 ; AVX2-FP-NEXT: vmovdqa 512(%rdi), %xmm8
10862 ; AVX2-FP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10863 ; AVX2-FP-NEXT: vmovdqa 560(%rdi), %xmm9
10864 ; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10865 ; AVX2-FP-NEXT: vmovdqa 544(%rdi), %xmm12
10866 ; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10867 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
10868 ; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10869 ; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
10870 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10871 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
10872 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm5[0,1],xmm4[2,3]
10873 ; AVX2-FP-NEXT: vmovdqa 704(%rdi), %ymm5
10874 ; AVX2-FP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10875 ; AVX2-FP-NEXT: vmovdqa 736(%rdi), %ymm4
10876 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10877 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,2]
10878 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10879 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
10880 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10881 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
10882 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,0,2]
10883 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10884 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
10885 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10886 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
10887 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7]
10888 ; AVX2-FP-NEXT: vmovdqa 640(%rdi), %ymm9
10889 ; AVX2-FP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10890 ; AVX2-FP-NEXT: vmovdqa 672(%rdi), %ymm8
10891 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10892 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,2]
10893 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10894 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
10895 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10896 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
10897 ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm9[0,1,0,2]
10898 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10899 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
10900 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10901 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
10902 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
10903 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
10904 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
10905 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10906 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
10907 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,1,1]
10908 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
10909 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm8[1],xmm12[2,3]
10910 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
10911 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
10912 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
10913 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
10914 ; AVX2-FP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
10915 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10916 ; AVX2-FP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10917 ; AVX2-FP-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10918 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
10919 ; AVX2-FP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
10920 ; AVX2-FP-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10921 ; AVX2-FP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
10922 ; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10923 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
10924 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
10925 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
10926 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10927 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
10928 ; AVX2-FP-NEXT: # xmm12 = mem[1,1,1,1]
10929 ; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
10930 ; AVX2-FP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
10931 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
10932 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
10933 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
10934 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
10935 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10936 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10937 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
10938 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10939 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10940 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
10941 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
10942 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
10943 ; AVX2-FP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10944 ; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload
10945 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1]
10946 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
10947 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3]
10948 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
10949 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
10950 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
10951 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
10952 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10953 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10954 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
10955 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10956 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10957 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
10958 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
10959 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
10960 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10961 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
10962 ; AVX2-FP-NEXT: # xmm0 = mem[1,1,1,1]
10963 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
10964 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
10965 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
10966 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
10967 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
10968 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
10969 ; AVX2-FP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10970 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
10971 ; AVX2-FP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10972 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
10973 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10974 ; AVX2-FP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10975 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
10976 ; AVX2-FP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10977 ; AVX2-FP-NEXT: # ymm3 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
10978 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
10979 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
10980 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
10981 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10982 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
10983 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
10984 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
10985 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
10986 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
10987 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
10988 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10989 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10990 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
10991 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10992 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
10993 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
10994 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
10995 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
10996 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
10997 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
10998 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
10999 ; AVX2-FP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11000 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11001 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11002 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11003 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11004 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11005 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11006 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11007 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
11008 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
11009 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11010 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11011 ; AVX2-FP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
11012 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11013 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11014 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
11015 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11016 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11017 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
11018 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11019 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11020 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11021 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11022 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11023 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
11024 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11025 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11026 ; AVX2-FP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11027 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11028 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11029 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11030 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11031 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11032 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11033 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11034 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
11035 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
11036 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
11037 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11038 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
11039 ; AVX2-FP-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
11040 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
11041 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
11042 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11043 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11044 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11045 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
11046 ; AVX2-FP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
11047 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
11048 ; AVX2-FP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
11049 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11050 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11051 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11052 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11053 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11054 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11055 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
11056 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11057 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
11058 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
11059 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
11060 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
11061 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
11062 ; AVX2-FP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
11063 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
11064 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
11065 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11066 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11067 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11068 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11069 ; AVX2-FP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
11070 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11071 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
11072 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11073 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11074 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
11075 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
11076 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
11077 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11078 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11079 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11080 ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
11081 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
11082 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
11083 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
11084 ; AVX2-FP-NEXT: # xmm3 = mem[3,3,3,3]
11085 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
11086 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11087 ; AVX2-FP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11088 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11089 ; AVX2-FP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11090 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11091 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
11092 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11093 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11094 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11095 ; AVX2-FP-NEXT: # ymm15 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11096 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
11097 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
11098 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11099 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11100 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11101 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11102 ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
11103 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
11104 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
11105 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
11106 ; AVX2-FP-NEXT: # xmm3 = mem[3,3,3,3]
11107 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
11108 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11109 ; AVX2-FP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11110 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11111 ; AVX2-FP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11112 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11113 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
11114 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11115 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11116 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
11117 ; AVX2-FP-NEXT: # ymm11 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11118 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7]
11119 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
11120 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11121 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11122 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11123 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11124 ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
11125 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
11126 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
11127 ; AVX2-FP-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload
11128 ; AVX2-FP-NEXT: # xmm3 = mem[3,3,3,3]
11129 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
11130 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11131 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11132 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11133 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
11134 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11135 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11136 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
11137 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
11138 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11139 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11140 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload
11141 ; AVX2-FP-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3]
11142 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
11143 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
11144 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
11145 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
11146 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11147 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11148 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11149 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
11150 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11151 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11152 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
11153 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11154 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11155 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11156 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11157 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
11158 ; AVX2-FP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11159 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11160 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11161 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
11162 ; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11163 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11164 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11165 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
11166 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11167 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11168 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11169 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
11170 ; AVX2-FP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11171 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11172 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
11173 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm1
11174 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11175 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11176 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11177 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11178 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
11179 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11180 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11181 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11182 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11183 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
11184 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11185 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11186 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
11187 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11188 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11189 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11190 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11191 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11192 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11193 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11194 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3]
11195 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11196 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
11197 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11198 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11199 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
11200 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11201 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11202 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11203 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11204 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11205 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11206 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11207 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
11208 ; AVX2-FP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11209 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11210 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11211 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
11212 ; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11213 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11214 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11215 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
11216 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11217 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11218 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11219 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
11220 ; AVX2-FP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11221 ; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11222 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
11223 ; AVX2-FP-NEXT: vpbroadcastd %xmm4, %xmm1
11224 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11225 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11226 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11227 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11228 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
11229 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11230 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11231 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11232 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11233 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
11234 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11235 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
11236 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11237 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11238 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11239 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11240 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11241 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11242 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3]
11243 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11244 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
11245 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11246 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
11247 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11248 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11249 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11250 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11251 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11252 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11253 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
11254 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11255 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11256 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11257 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
11258 ; AVX2-FP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11259 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11260 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
11261 ; AVX2-FP-NEXT: vpbroadcastd %xmm2, %xmm1
11262 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11263 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11264 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
11265 ; AVX2-FP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
11266 ; AVX2-FP-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
11267 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11268 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11269 ; AVX2-FP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
11270 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11271 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
11272 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11273 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11274 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
11275 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11276 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11277 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11278 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11279 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4]
11280 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11281 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4]
11282 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11283 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11284 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11285 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3]
11286 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11287 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11288 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,1,3]
11289 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11290 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7]
11291 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11292 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
11293 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11294 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11295 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11296 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11297 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11298 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11299 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
11300 ; AVX2-FP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11301 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11302 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11303 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
11304 ; AVX2-FP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11305 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11306 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11307 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
11308 ; AVX2-FP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11309 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11310 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11311 ; AVX2-FP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
11312 ; AVX2-FP-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
11313 ; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
11314 ; AVX2-FP-NEXT: vpbroadcastd %xmm1, %xmm0
11315 ; AVX2-FP-NEXT: vpbroadcastd %xmm12, %xmm1
11316 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
11317 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
11318 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
11319 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
11320 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3]
11321 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11322 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11323 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3]
11324 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11325 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
11326 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
11327 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
11328 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
11329 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7]
11330 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
11331 ; AVX2-FP-NEXT: # ymm0 = mem[0,1,1,3]
11332 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11333 ; AVX2-FP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11334 ; AVX2-FP-NEXT: # ymm15 = mem[0,1,1,3]
11335 ; AVX2-FP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11336 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
11337 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
11338 ; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7]
11339 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
11340 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
11341 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
11342 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
11343 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11344 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
11345 ; AVX2-FP-NEXT: # xmm12 = mem[1,1,1,1]
11346 ; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
11347 ; AVX2-FP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
11348 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11349 ; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
11350 ; AVX2-FP-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
11351 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
11352 ; AVX2-FP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
11353 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11354 ; AVX2-FP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
11355 ; AVX2-FP-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11356 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
11357 ; AVX2-FP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
11358 ; AVX2-FP-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11359 ; AVX2-FP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
11360 ; AVX2-FP-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11361 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
11362 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
11363 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
11364 ; AVX2-FP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11365 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11366 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1]
11367 ; AVX2-FP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
11368 ; AVX2-FP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
11369 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
11370 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11371 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
11372 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
11373 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11374 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11375 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
11376 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11377 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11378 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
11379 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
11380 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
11381 ; AVX2-FP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11382 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11383 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,1,1]
11384 ; AVX2-FP-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload
11385 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3]
11386 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
11387 ; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm9 # 16-byte Folded Reload
11388 ; AVX2-FP-NEXT: # xmm9 = xmm11[0],mem[0],xmm11[1],mem[1]
11389 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
11390 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11391 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11392 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
11393 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11394 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11395 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
11396 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
11397 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
11398 ; AVX2-FP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11399 ; AVX2-FP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
11400 ; AVX2-FP-NEXT: # xmm4 = mem[1,1,1,1]
11401 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
11402 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3]
11403 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
11404 ; AVX2-FP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
11405 ; AVX2-FP-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
11406 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
11407 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
11408 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
11409 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
11410 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
11411 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
11412 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
11413 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
11414 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
11415 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11416 ; AVX2-FP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
11417 ; AVX2-FP-NEXT: # xmm0 = mem[2,2,2,2]
11418 ; AVX2-FP-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11419 ; AVX2-FP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
11420 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
11421 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
11422 ; AVX2-FP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
11423 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11424 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11425 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
11426 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11427 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11428 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
11429 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11430 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11431 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11432 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11433 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11434 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
11435 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11436 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11437 ; AVX2-FP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11438 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11439 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11440 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11441 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11442 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11443 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11444 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11445 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
11446 ; AVX2-FP-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11447 ; AVX2-FP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
11448 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
11449 ; AVX2-FP-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3]
11450 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11451 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
11452 ; AVX2-FP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
11453 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11454 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11455 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
11456 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11457 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11458 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11459 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11460 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11461 ; AVX2-FP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
11462 ; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11463 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11464 ; AVX2-FP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11465 ; AVX2-FP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11466 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11467 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11468 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11469 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11470 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11471 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11472 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
11473 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
11474 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
11475 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
11476 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
11477 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
11478 ; AVX2-FP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
11479 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11480 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
11481 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11482 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11483 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm0[7]
11484 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11485 ; AVX2-FP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
11486 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11487 ; AVX2-FP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
11488 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11489 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11490 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
11491 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
11492 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7]
11493 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
11494 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
11495 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,2,2]
11496 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
11497 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
11498 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
11499 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm10[2],xmm13[2],xmm10[3],xmm13[3]
11500 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm6[2,3]
11501 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
11502 ; AVX2-FP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
11503 ; AVX2-FP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
11504 ; AVX2-FP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
11505 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
11506 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
11507 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm6[7]
11508 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
11509 ; AVX2-FP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
11510 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
11511 ; AVX2-FP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
11512 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
11513 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
11514 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
11515 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
11516 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7]
11517 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11518 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
11519 ; AVX2-FP-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload
11520 ; AVX2-FP-NEXT: # xmm1 = mem[2,3,2,3]
11521 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11522 ; AVX2-FP-NEXT: # xmm2 = mem[3,3,3,3]
11523 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
11524 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
11525 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11526 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11527 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
11528 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11529 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11530 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
11531 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
11532 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
11533 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
11534 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
11535 ; AVX2-FP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
11536 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
11537 ; AVX2-FP-NEXT: # xmm2 = mem[2,3,2,3]
11538 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
11539 ; AVX2-FP-NEXT: # xmm3 = mem[3,3,3,3]
11540 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
11541 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
11542 ; AVX2-FP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
11543 ; AVX2-FP-NEXT: # ymm2 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11544 ; AVX2-FP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11545 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11546 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
11547 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
11548 ; AVX2-FP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11549 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11550 ; AVX2-FP-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11551 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
11552 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
11553 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
11554 ; AVX2-FP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
11555 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
11556 ; AVX2-FP-NEXT: # xmm3 = mem[2,3,2,3]
11557 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,3,3,3]
11558 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3]
11559 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
11560 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11561 ; AVX2-FP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11562 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
11563 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11564 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11565 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
11566 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
11567 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
11568 ; AVX2-FP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
11569 ; AVX2-FP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
11570 ; AVX2-FP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
11571 ; AVX2-FP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
11572 ; AVX2-FP-NEXT: # xmm4 = mem[2,3,2,3]
11573 ; AVX2-FP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
11574 ; AVX2-FP-NEXT: # xmm5 = mem[3,3,3,3]
11575 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
11576 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
11577 ; AVX2-FP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
11578 ; AVX2-FP-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
11579 ; AVX2-FP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11580 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
11581 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
11582 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
11583 ; AVX2-FP-NEXT: # ymm5 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
11584 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
11585 ; AVX2-FP-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
11586 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
11587 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
11588 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
11589 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11590 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rsi)
11591 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11592 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rsi)
11593 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11594 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rsi)
11595 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11596 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rsi)
11597 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11598 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rdx)
11599 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11600 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rdx)
11601 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11602 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rdx)
11603 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11604 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rdx)
11605 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11606 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rcx)
11607 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11608 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rcx)
11609 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11610 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rcx)
11611 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11612 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rcx)
11613 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11614 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r8)
11615 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11616 ; AVX2-FP-NEXT: vmovaps %ymm4, (%r8)
11617 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11618 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r8)
11619 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11620 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r8)
11621 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11622 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%r9)
11623 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11624 ; AVX2-FP-NEXT: vmovaps %ymm4, (%r9)
11625 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11626 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%r9)
11627 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11628 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%r9)
11629 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11630 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11631 ; AVX2-FP-NEXT: vmovaps %ymm4, 64(%rax)
11632 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11633 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rax)
11634 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11635 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rax)
11636 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11637 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rax)
11638 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11639 ; AVX2-FP-NEXT: vmovdqa %ymm15, 64(%rax)
11640 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11641 ; AVX2-FP-NEXT: vmovaps %ymm4, (%rax)
11642 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11643 ; AVX2-FP-NEXT: vmovaps %ymm4, 96(%rax)
11644 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
11645 ; AVX2-FP-NEXT: vmovaps %ymm4, 32(%rax)
11646 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax
11647 ; AVX2-FP-NEXT: vmovdqa %ymm3, 96(%rax)
11648 ; AVX2-FP-NEXT: vmovdqa %ymm2, 64(%rax)
11649 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rax)
11650 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%rax)
11651 ; AVX2-FP-NEXT: addq $2408, %rsp # imm = 0x968
11652 ; AVX2-FP-NEXT: vzeroupper
11653 ; AVX2-FP-NEXT: retq
; AVX2-FCP-LABEL: load_i16_stride8_vf64:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: subq $2408, %rsp # imm = 0x968
; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm4
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm5
; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovdqa 336(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %xmm4
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm3[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 880(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 864(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovdqa 848(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 832(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovdqa 784(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 768(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 816(%rdi), %xmm3
; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 800(%rdi), %xmm4
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovdqa 960(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 992(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vmovdqa 896(%rdi), %ymm3
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 928(%rdi), %ymm2
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 112(%rdi), %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX2-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %ymm0
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %ymm1
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5],ymm9[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 624(%rdi), %xmm4
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 608(%rdi), %xmm5
; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm4
; AVX2-FCP-NEXT: vmovdqa 592(%rdi), %xmm5
; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 576(%rdi), %xmm8
; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm5, %xmm5
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FCP-NEXT: vmovdqa 528(%rdi), %xmm5
; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 512(%rdi), %xmm8
; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 560(%rdi), %xmm9
; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 544(%rdi), %xmm12
; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3]
; AVX2-FCP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3]
; AVX2-FCP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm5[0,1],xmm4[2,3]
; AVX2-FCP-NEXT: vmovdqa 704(%rdi), %ymm5
; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 736(%rdi), %ymm4
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm4[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm5[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm9[0,1,2,3,4,5,6],ymm8[7]
; AVX2-FCP-NEXT: vmovdqa 640(%rdi), %ymm9
; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa 672(%rdi), %ymm8
; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm8[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm9[0,1,0,2]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,1,1,1]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0],xmm8[1],xmm12[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
; AVX2-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
; AVX2-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm12 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm12[0,1,2,3],ymm6[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm13 # 16-byte Reload
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[1,1,1,1]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0],xmm12[1],xmm6[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm7[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; AVX2-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm4[2,2,2,2]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm8[2],xmm9[3],xmm8[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm10[2,2,2,2]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm13[2],xmm12[2],xmm13[3],xmm12[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm6[2,2,2,2]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm6[2],xmm7[2],xmm6[3],xmm7[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm3 = mem[3,3,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-FCP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm3 = mem[3,3,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-FCP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm11 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4],ymm3[5],ymm11[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vpshufd $255, (%rsp), %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm3 = mem[3,3,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = xmm5[2],mem[2],xmm5[3],mem[3]
; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2,3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
; AVX2-FCP-NEXT: vpbroadcastd %xmm4, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm2[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
; AVX2-FCP-NEXT: vpbroadcastd %xmm2, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm2[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm5[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm12 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm12 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
; AVX2-FCP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vpbroadcastd %xmm1, %xmm0
; AVX2-FCP-NEXT: vpbroadcastd %xmm12, %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3,4,5,6],ymm0[7]
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm0 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[0,1,1,3]
; AVX2-FCP-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm14 = ymm1[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
; AVX2-FCP-NEXT: vpshufd {{.*#+}} ymm0 = ymm15[0,2,2,3,4,6,6,7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm12 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm13 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm13 = xmm13[0],mem[0],xmm13[1],mem[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
; AVX2-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm13 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm14 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
; AVX2-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm15 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5],ymm15[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,1,1]
; AVX2-FCP-NEXT: vpblendd $2, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12, %xmm12 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm12 = xmm12[0],mem[1],xmm12[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm10[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm12[0,1,2,3],ymm8[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm12[1,1,1,1]
; AVX2-FCP-NEXT: vmovdqa (%rsp), %xmm10 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0],xmm10[1],xmm8[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm9 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm9 = xmm11[0],mem[0],xmm11[1],mem[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm8 = xmm8[0,1],xmm9[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5],ymm4[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm6[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm4 = mem[1,1,1,1]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm13[1],xmm4[2,3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5],ymm0[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm0[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[2,2,2,2]
; AVX2-FCP-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm15[2,2,2,2]
; AVX2-FCP-NEXT: vpblendd $7, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm0 = mem[0,1,2],xmm0[3]
; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
; AVX2-FCP-NEXT: # xmm1 = xmm14[2],mem[2],xmm14[3],mem[3]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
; AVX2-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12465 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[2,2,2,2]
12466 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1,2],xmm0[3]
12467 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm12[2],xmm10[2],xmm12[3],xmm10[3]
12468 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm1[0,1],xmm0[2,3]
12469 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
12470 ; AVX2-FCP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
12471 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12472 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
12473 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12474 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12475 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3,4,5,6],ymm0[7]
12476 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12477 ; AVX2-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
12478 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12479 ; AVX2-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
12480 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12481 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12482 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4],ymm9[5],ymm11[6,7]
12483 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
12484 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm8[4,5,6,7]
12485 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12486 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
12487 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm12[2,2,2,2]
12488 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload
12489 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0,1,2],xmm6[3]
12490 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
12491 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm8 = xmm10[2],xmm13[2],xmm10[3],xmm13[3]
12492 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm6[2,3]
12493 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
12494 ; AVX2-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
12495 ; AVX2-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
12496 ; AVX2-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
12497 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm13[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12498 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm8[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12499 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5,6],ymm6[7]
12500 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
12501 ; AVX2-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
12502 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
12503 ; AVX2-FCP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
12504 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm11[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12505 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12506 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
12507 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
12508 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1,2,3],ymm0[4,5,6,7]
12509 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12510 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm14[2],xmm0[3],xmm14[3]
12511 ; AVX2-FCP-NEXT: vpshufd $238, (%rsp), %xmm1 # 16-byte Folded Reload
12512 ; AVX2-FCP-NEXT: # xmm1 = mem[2,3,2,3]
12513 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12514 ; AVX2-FCP-NEXT: # xmm2 = mem[3,3,3,3]
12515 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3]
12516 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
12517 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12518 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12519 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12520 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12521 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12522 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12523 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12524 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
12525 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
12526 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12527 ; AVX2-FCP-NEXT: # xmm0 = xmm0[2],mem[2],xmm0[3],mem[3]
12528 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
12529 ; AVX2-FCP-NEXT: # xmm2 = mem[2,3,2,3]
12530 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
12531 ; AVX2-FCP-NEXT: # xmm3 = mem[3,3,3,3]
12532 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0],xmm2[1],xmm3[2,3]
12533 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
12534 ; AVX2-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12535 ; AVX2-FCP-NEXT: # ymm2 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12536 ; AVX2-FCP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12537 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12538 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
12539 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12540 ; AVX2-FCP-NEXT: # ymm3 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12541 ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12542 ; AVX2-FCP-NEXT: # ymm4 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12543 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
12544 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
12545 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
12546 ; AVX2-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm12[2],xmm9[3],xmm12[3]
12547 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
12548 ; AVX2-FCP-NEXT: # xmm3 = mem[2,3,2,3]
12549 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm10[3,3,3,3]
12550 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3]
12551 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
12552 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12553 ; AVX2-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12554 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5,6],ymm3[7]
12555 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12556 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12557 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5],ymm5[6,7]
12558 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
12559 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
12560 ; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
12561 ; AVX2-FCP-NEXT: vpunpckhdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
12562 ; AVX2-FCP-NEXT: # xmm3 = xmm3[2],mem[2],xmm3[3],mem[3]
12563 ; AVX2-FCP-NEXT: vpshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
12564 ; AVX2-FCP-NEXT: # xmm4 = mem[2,3,2,3]
12565 ; AVX2-FCP-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
12566 ; AVX2-FCP-NEXT: # xmm5 = mem[3,3,3,3]
12567 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm5[0],xmm4[1],xmm5[2,3]
12568 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
12569 ; AVX2-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12570 ; AVX2-FCP-NEXT: # ymm4 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
12571 ; AVX2-FCP-NEXT: vpshufhw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12572 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
12573 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5,6],ymm4[7]
12574 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12575 ; AVX2-FCP-NEXT: # ymm5 = mem[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
12576 ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
12577 ; AVX2-FCP-NEXT: # ymm6 = mem[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
12578 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5],ymm6[6,7]
12579 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
12580 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4,5,6,7]
12581 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12582 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rsi)
12583 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12584 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rsi)
12585 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12586 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rsi)
12587 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12588 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rsi)
12589 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12590 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rdx)
12591 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12592 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rdx)
12593 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12594 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rdx)
12595 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12596 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rdx)
12597 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12598 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rcx)
12599 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12600 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rcx)
12601 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12602 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rcx)
12603 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12604 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rcx)
12605 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12606 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%r8)
12607 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12608 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r8)
12609 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12610 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%r8)
12611 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12612 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r8)
12613 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12614 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%r9)
12615 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12616 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%r9)
12617 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12618 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%r9)
12619 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12620 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%r9)
12621 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12622 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12623 ; AVX2-FCP-NEXT: vmovaps %ymm4, 64(%rax)
12624 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12625 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax)
12626 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12627 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rax)
12628 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12629 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rax)
12630 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12631 ; AVX2-FCP-NEXT: vmovdqa %ymm15, 64(%rax)
12632 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12633 ; AVX2-FCP-NEXT: vmovaps %ymm4, (%rax)
12634 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12635 ; AVX2-FCP-NEXT: vmovaps %ymm4, 96(%rax)
12636 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
12637 ; AVX2-FCP-NEXT: vmovaps %ymm4, 32(%rax)
12638 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
12639 ; AVX2-FCP-NEXT: vmovdqa %ymm3, 96(%rax)
12640 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 64(%rax)
12641 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rax)
12642 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%rax)
12643 ; AVX2-FCP-NEXT: addq $2408, %rsp # imm = 0x968
12644 ; AVX2-FCP-NEXT: vzeroupper
12645 ; AVX2-FCP-NEXT: retq
12647 ; AVX512-LABEL: load_i16_stride8_vf64:
12649 ; AVX512-NEXT: subq $2408, %rsp # imm = 0x968
12650 ; AVX512-NEXT: vmovdqa 368(%rdi), %xmm1
12651 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12652 ; AVX512-NEXT: vmovdqa 352(%rdi), %xmm0
12653 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12654 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
12655 ; AVX512-NEXT: vmovdqa 336(%rdi), %xmm2
12656 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12657 ; AVX512-NEXT: vmovdqa 320(%rdi), %xmm1
12658 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12659 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
12660 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4]
12661 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
12662 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12663 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
12664 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
12665 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12666 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12667 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12668 ; AVX512-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
12669 ; AVX512-NEXT: vmovdqa 304(%rdi), %xmm0
12670 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12671 ; AVX512-NEXT: vmovdqa 288(%rdi), %xmm2
12672 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12673 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
12674 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12675 ; AVX512-NEXT: vmovdqa 272(%rdi), %xmm0
12676 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12677 ; AVX512-NEXT: vmovdqa 256(%rdi), %xmm2
12678 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12679 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
12680 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
12681 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm31
12682 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12683 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
12684 ; AVX512-NEXT: vmovdqa 480(%rdi), %ymm1
12685 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12686 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
12687 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12688 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
12689 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12690 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12691 ; AVX512-NEXT: vmovdqa 448(%rdi), %ymm2
12692 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12693 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
12694 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12695 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
12696 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12697 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12698 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12699 ; AVX512-NEXT: vmovdqa 416(%rdi), %ymm2
12700 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12701 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
12702 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12703 ; AVX512-NEXT: vmovdqa 384(%rdi), %ymm2
12704 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12705 ; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
12706 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12707 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
12708 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12709 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12710 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
12711 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12712 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12713 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12714 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12715 ; AVX512-NEXT: movb $-64, %al
12716 ; AVX512-NEXT: kmovw %eax, %k1
12717 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
12718 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm2
12719 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12720 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm1
12721 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12722 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
12723 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm3
12724 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12725 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2
12726 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12727 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
12728 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2]
12729 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
12730 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12731 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
12732 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12733 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
12734 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12735 ; AVX512-NEXT: vpermt2d %xmm1, %xmm6, %xmm2
12736 ; AVX512-NEXT: vmovdqa (%rdi), %xmm1
12737 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12738 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
12739 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12740 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm4
12741 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12742 ; AVX512-NEXT: vmovdqa 48(%rdi), %xmm5
12743 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12744 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
12745 ; AVX512-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12746 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
12747 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
12748 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm17
12749 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
12750 ; AVX512-NEXT: vmovdqa 224(%rdi), %ymm2
12751 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12752 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
12753 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12754 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2
12755 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12756 ; AVX512-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
12757 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12758 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,4]
12759 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12760 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12761 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,4]
12762 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12763 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12764 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
12765 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm3
12766 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12767 ; AVX512-NEXT: vpermq {{.*#+}} ymm24 = ymm3[0,1,0,2]
12768 ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm3
12769 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12770 ; AVX512-NEXT: vpermq {{.*#+}} ymm22 = ymm3[0,1,0,2]
12771 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,2,2,3,4,6,6,7]
12772 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12773 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12774 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[0,2,2,3,4,6,6,7]
12775 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12776 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
12777 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
12778 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
12779 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
12780 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12781 ; AVX512-NEXT: vmovdqa 880(%rdi), %xmm1
12782 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12783 ; AVX512-NEXT: vmovdqa 864(%rdi), %xmm0
12784 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12785 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
12786 ; AVX512-NEXT: vmovdqa 848(%rdi), %xmm2
12787 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12788 ; AVX512-NEXT: vmovdqa 832(%rdi), %xmm1
12789 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12790 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
12791 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
12792 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
12793 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12794 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
12795 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12796 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
12797 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12798 ; AVX512-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
12799 ; AVX512-NEXT: vmovdqa %xmm6, %xmm10
12800 ; AVX512-NEXT: vmovdqa 816(%rdi), %xmm0
12801 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12802 ; AVX512-NEXT: vmovdqa 800(%rdi), %xmm2
12803 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12804 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
12805 ; AVX512-NEXT: vmovdqa 784(%rdi), %xmm0
12806 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12807 ; AVX512-NEXT: vmovdqa 768(%rdi), %xmm2
12808 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12809 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
12810 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
12811 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm19
12812 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm20
12813 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
12814 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
12815 ; AVX512-NEXT: vmovdqa 992(%rdi), %ymm1
12816 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12817 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
12818 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12819 ; AVX512-NEXT: vmovdqa 960(%rdi), %ymm1
12820 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12821 ; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2]
12822 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12823 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,0,4,5,6,4]
12824 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12825 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4]
12826 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12827 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12828 ; AVX512-NEXT: vmovdqa 928(%rdi), %ymm2
12829 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12830 ; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
12831 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12832 ; AVX512-NEXT: vmovdqa 896(%rdi), %ymm2
12833 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12834 ; AVX512-NEXT: vpermq {{.*#+}} ymm18 = ymm2[0,1,0,2]
12835 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
12836 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12837 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
12838 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12839 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5],ymm9[6,7]
12840 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12841 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
12842 ; AVX512-NEXT: vmovdqa 624(%rdi), %xmm2
12843 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12844 ; AVX512-NEXT: vmovdqa 608(%rdi), %xmm1
12845 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12846 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
12847 ; AVX512-NEXT: vmovdqa 592(%rdi), %xmm5
12848 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12849 ; AVX512-NEXT: vmovdqa 576(%rdi), %xmm2
12850 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12851 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
12852 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
12853 ; AVX512-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
12854 ; AVX512-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
12855 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
12856 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
12857 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12858 ; AVX512-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
12859 ; AVX512-NEXT: vmovdqa 560(%rdi), %xmm1
12860 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12861 ; AVX512-NEXT: vmovdqa 544(%rdi), %xmm5
12862 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12863 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
12864 ; AVX512-NEXT: vmovdqa 528(%rdi), %xmm1
12865 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12866 ; AVX512-NEXT: vmovdqa 512(%rdi), %xmm5
12867 ; AVX512-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
12868 ; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
12869 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
12870 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3]
12871 ; AVX512-NEXT: vmovdqa 736(%rdi), %ymm1
12872 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12873 ; AVX512-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
12874 ; AVX512-NEXT: vmovdqa 704(%rdi), %ymm1
12875 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12876 ; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
12877 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12878 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,0,4,5,6,4]
12879 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
12880 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
12881 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
12882 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm10[7]
12883 ; AVX512-NEXT: vmovdqa 672(%rdi), %ymm5
12884 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12885 ; AVX512-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2]
12886 ; AVX512-NEXT: vmovdqa 640(%rdi), %ymm5
12887 ; AVX512-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
12888 ; AVX512-NEXT: vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2]
12889 ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7]
12890 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
12891 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7]
12892 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
12893 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7]
12894 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7]
12895 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
12896 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
12897 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12898 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1]
12899 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
12900 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
12901 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12902 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
12903 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
12904 ; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12905 ; AVX512-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12906 ; AVX512-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
12907 ; AVX512-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12908 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
12909 ; AVX512-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
12910 ; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12911 ; AVX512-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
12912 ; AVX512-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12913 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
12914 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
12915 ; AVX512-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 {%k1}
12916 ; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
12917 ; AVX512-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12918 ; AVX512-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
12919 ; AVX512-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12920 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
12921 ; AVX512-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
12922 ; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12923 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12924 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
12925 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
12926 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
12927 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
12928 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3]
12929 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
12930 ; AVX512-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
12931 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
12932 ; AVX512-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
12933 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12934 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1]
12935 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm9
12936 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
12937 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12938 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
12939 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12940 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12941 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
12942 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12943 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12944 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
12945 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
12946 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
12947 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
12948 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
12949 ; AVX512-NEXT: vmovdqa %xmm8, %xmm5
12950 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
12951 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm4
12952 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
12953 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
12954 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
12955 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
12956 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
12957 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
12958 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
12959 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12960 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
12961 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
12962 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
12963 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3]
12964 ; AVX512-NEXT: vmovdqa64 %xmm14, %xmm16
12965 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
12966 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
12967 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
12968 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12969 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
12970 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12971 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm27
12972 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12973 ; AVX512-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
12974 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12975 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26
12976 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12977 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
12978 ; AVX512-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
12979 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12980 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm23
12981 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
12982 ; AVX512-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
12983 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
12984 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm21
12985 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
12986 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
12987 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
12988 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
12989 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
12990 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
12991 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm25
12992 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
12993 ; AVX512-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
12994 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
12995 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
12996 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[3,1,2,3,7,5,6,7]
12997 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
12998 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[3,1,2,3,7,5,6,7]
12999 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13000 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13001 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13002 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3]
13003 ; AVX512-NEXT: vmovdqa64 %xmm13, %xmm22
13004 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13005 ; AVX512-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
13006 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13007 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13008 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13009 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3]
13010 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm24
13011 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13012 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
13013 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
13014 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
13015 ; AVX512-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
13016 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13017 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
13018 ; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
13019 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13020 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13021 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
13022 ; AVX512-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
13023 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13024 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
13025 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13026 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
13027 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13028 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
13029 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3]
13030 ; AVX512-NEXT: vmovdqa64 %xmm5, %xmm18
13031 ; AVX512-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload
13032 ; AVX512-NEXT: # xmm3 = xmm0[0,1],mem[2,3]
13033 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7]
13034 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13035 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
13036 ; AVX512-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
13037 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13038 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13039 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7]
13040 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13041 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7]
13042 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13043 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
13044 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13045 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
13046 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
13047 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13048 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0]
13049 ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm1
13050 ; AVX512-NEXT: vpermt2d %xmm16, %xmm0, %xmm1
13051 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
13052 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
13053 ; AVX512-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
13054 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13055 ; AVX512-NEXT: vmovdqa64 %ymm27, %ymm1
13056 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13057 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm3
13058 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13059 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
13060 ; AVX512-NEXT: vmovdqa64 %ymm23, %ymm3
13061 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13062 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm13
13063 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13064 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
13065 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
13066 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13067 ; AVX512-NEXT: vmovdqa64 %xmm17, %xmm1
13068 ; AVX512-NEXT: vpermt2d %xmm22, %xmm16, %xmm1
13069 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13070 ; AVX512-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
13071 ; AVX512-NEXT: vmovdqa64 %ymm25, %ymm3
13072 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13073 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13074 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
13075 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13076 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13077 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
13078 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7]
13079 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
13080 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13081 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13082 ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm0
13083 ; AVX512-NEXT: vpermt2d %xmm24, %xmm16, %xmm0
13084 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13085 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
13086 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13087 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13088 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13089 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
13090 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13091 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13092 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
13093 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
13094 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13095 ; AVX512-NEXT: vpermt2d %xmm18, %xmm16, %xmm15
13096 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
13097 ; AVX512-NEXT: # xmm1 = xmm15[0,1],mem[2,3]
13098 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13099 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13100 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
13101 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13102 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13103 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
13104 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
13105 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13106 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13107 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13108 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13109 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
13110 ; AVX512-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13111 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13112 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
13113 ; AVX512-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13114 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
13115 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
13116 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13117 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
13118 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13119 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
13120 ; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13121 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4]
13122 ; AVX512-NEXT: vpermt2d %xmm5, %xmm10, %xmm6
13123 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13124 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
13125 ; AVX512-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13126 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13127 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13128 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13129 ; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13130 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13131 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm28
13132 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
13133 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13134 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13135 ; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
13136 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13137 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13138 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3]
13139 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13140 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
13141 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13142 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13143 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
13144 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13145 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13146 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13147 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13148 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3]
13149 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13150 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13151 ; AVX512-NEXT: # ymm3 = mem[0,1,1,3]
13152 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13153 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
13154 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13155 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13156 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
13157 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13158 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13159 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13160 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13161 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13162 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13163 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13164 ; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
13165 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13166 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13167 ; AVX512-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
13168 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2]
13169 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
13170 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13171 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
13172 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13173 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13174 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13175 ; AVX512-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
13176 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13177 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
13178 ; AVX512-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
13179 ; AVX512-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13180 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13181 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13182 ; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
13183 ; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13184 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
13185 ; AVX512-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
13186 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13187 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3]
13188 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13189 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13190 ; AVX512-NEXT: # ymm3 = mem[0,1,1,3]
13191 ; AVX512-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13192 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
13193 ; AVX512-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13194 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13195 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
13196 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13197 ; AVX512-NEXT: vmovdqa64 %ymm4, %ymm17
13198 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13199 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
13200 ; AVX512-NEXT: # ymm30 = mem[0,1,1,3]
13201 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
13202 ; AVX512-NEXT: # ymm4 = mem[0,1,1,3]
13203 ; AVX512-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13204 ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7]
13205 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13206 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
13207 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13208 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
13209 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
13210 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13211 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13212 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13213 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13214 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13215 ; AVX512-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13216 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
13217 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13218 ; AVX512-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
13219 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
13220 ; AVX512-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
13221 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13222 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
13223 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13224 ; AVX512-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13225 ; AVX512-NEXT: vpermt2d %xmm0, %xmm10, %xmm1
13226 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13227 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13228 ; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13229 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13230 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
13231 ; AVX512-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13232 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
13233 ; AVX512-NEXT: vmovdqa64 %xmm3, %xmm16
13234 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20
13235 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
13236 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
13237 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
13238 ; AVX512-NEXT: # ymm0 = mem[0,1,1,3]
13239 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13240 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13241 ; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
13242 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13243 ; AVX512-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,4]
13244 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13245 ; AVX512-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
13246 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13247 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13248 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
13249 ; AVX512-NEXT: # ymm22 = mem[0,1,1,3]
13250 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
13251 ; AVX512-NEXT: # ymm29 = mem[0,1,1,3]
13252 ; AVX512-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7]
13253 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13254 ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,2,2,3,4,6,6,7]
13255 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13256 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7]
13257 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13258 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
13259 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13260 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
13261 ; AVX512-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13262 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13263 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
13264 ; AVX512-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13265 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13266 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
13267 ; AVX512-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13268 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
13269 ; AVX512-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
13270 ; AVX512-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
13271 ; AVX512-NEXT: vmovdqa %xmm10, %xmm1
13272 ; AVX512-NEXT: vpermi2d %xmm2, %xmm8, %xmm1
13273 ; AVX512-NEXT: vmovdqa64 %xmm8, %xmm19
13274 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm24
13275 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
13276 ; AVX512-NEXT: vmovdqa64 %xmm13, %xmm31
13277 ; AVX512-NEXT: vmovdqa64 %xmm11, %xmm21
13278 ; AVX512-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3]
13279 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13280 ; AVX512-NEXT: # ymm1 = mem[0,1,1,3]
13281 ; AVX512-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13282 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
13283 ; AVX512-NEXT: # ymm0 = mem[0,1,1,3]
13284 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13285 ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
13286 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13287 ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4]
13288 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13289 ; AVX512-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7]
13290 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
13291 ; AVX512-NEXT: # ymm0 = mem[0,1,1,3]
13292 ; AVX512-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13293 ; AVX512-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
13294 ; AVX512-NEXT: # ymm10 = mem[0,1,1,3]
13295 ; AVX512-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13296 ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7]
13297 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13298 ; AVX512-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7]
13299 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13300 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
13301 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
13302 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
13303 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
13304 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13305 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1]
13306 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
13307 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
13308 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13309 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
13310 ; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
13311 ; AVX512-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13312 ; AVX512-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
13313 ; AVX512-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13314 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
13315 ; AVX512-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
13316 ; AVX512-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13317 ; AVX512-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
13318 ; AVX512-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13319 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
13320 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
13321 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13322 ; AVX512-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1}
13323 ; AVX512-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
13324 ; AVX512-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13325 ; AVX512-NEXT: vmovdqa64 %ymm17, %ymm13
13326 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13327 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
13328 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13329 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13330 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
13331 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
13332 ; AVX512-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
13333 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
13334 ; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
13335 ; AVX512-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3]
13336 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
13337 ; AVX512-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
13338 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
13339 ; AVX512-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
13340 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13341 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1]
13342 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm7
13343 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
13344 ; AVX512-NEXT: vmovdqa64 %xmm23, %xmm9
13345 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
13346 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13347 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13348 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
13349 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13350 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13351 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
13352 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13353 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
13354 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
13355 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
13356 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm6
13357 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
13358 ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm5
13359 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
13360 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
13361 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13362 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13363 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
13364 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13365 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13366 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
13367 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13368 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
13369 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13370 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13371 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3]
13372 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13373 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
13374 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13375 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13376 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
13377 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13378 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm27
13379 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13380 ; AVX512-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
13381 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13382 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm26
13383 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13384 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13385 ; AVX512-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
13386 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13387 ; AVX512-NEXT: vmovdqa64 %ymm3, %ymm25
13388 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
13389 ; AVX512-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7]
13390 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13391 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13392 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13393 ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13394 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13395 ; AVX512-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
13396 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13397 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm21
13398 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
13399 ; AVX512-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
13400 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13401 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13402 ; AVX512-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7]
13403 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13404 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13405 ; AVX512-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
13406 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13407 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13408 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13409 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3]
13410 ; AVX512-NEXT: vmovdqa64 %xmm15, %xmm23
13411 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
13412 ; AVX512-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
13413 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
13414 ; AVX512-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13415 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13416 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3]
13417 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm19
13418 ; AVX512-NEXT: vmovdqa64 %xmm20, %xmm30
13419 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
13420 ; AVX512-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
13421 ; AVX512-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
13422 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
13423 ; AVX512-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
13424 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13425 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
13426 ; AVX512-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
13427 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13428 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13429 ; AVX512-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
13430 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13431 ; AVX512-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7]
13432 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13433 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
13434 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13435 ; AVX512-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
13436 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2]
13437 ; AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
13438 ; AVX512-NEXT: vmovdqa64 %xmm5, %xmm22
13439 ; AVX512-NEXT: vmovdqa64 %xmm31, %xmm4
13440 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3]
13441 ; AVX512-NEXT: vmovdqa64 %xmm6, %xmm29
13442 ; AVX512-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
13443 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
13444 ; AVX512-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
13445 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13446 ; AVX512-NEXT: vmovdqa64 %ymm0, %ymm18
13447 ; AVX512-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
13448 ; AVX512-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
13449 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13450 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13451 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13452 ; AVX512-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
13453 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13454 ; AVX512-NEXT: vmovdqa64 %ymm2, %ymm31
13455 ; AVX512-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13456 ; AVX512-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
13457 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13458 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
13459 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13460 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
13461 ; AVX512-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
13462 ; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
13463 ; AVX512-NEXT: vmovdqa64 %xmm28, %xmm1
13464 ; AVX512-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
13465 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
13466 ; AVX512-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
13467 ; AVX512-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
13468 ; AVX512-NEXT: vmovdqa64 %ymm27, %ymm3
13469 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13470 ; AVX512-NEXT: vmovdqa64 %ymm26, %ymm6
13471 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13472 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7]
13473 ; AVX512-NEXT: vmovdqa64 %ymm25, %ymm6
13474 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13475 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13476 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
13477 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7]
13478 ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
13479 ; AVX512-NEXT: vmovdqa64 %xmm17, %xmm3
13480 ; AVX512-NEXT: vpermt2d %xmm23, %xmm16, %xmm3
13481 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
13482 ; AVX512-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
13483 ; AVX512-NEXT: vmovdqa64 %ymm21, %ymm6
13484 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13485 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13486 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
13487 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13488 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13489 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
13490 ; AVX512-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
13491 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
13492 ; AVX512-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
13493 ; AVX512-NEXT: vmovdqa64 %xmm19, %xmm3
13494 ; AVX512-NEXT: vpermt2d %xmm30, %xmm16, %xmm3
13495 ; AVX512-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
13496 ; AVX512-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
13497 ; AVX512-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
13498 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13499 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13500 ; AVX512-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
13501 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13502 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13503 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
13504 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
13505 ; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1}
13506 ; AVX512-NEXT: vpermt2d %xmm29, %xmm16, %xmm4
13507 ; AVX512-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
13508 ; AVX512-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
13509 ; AVX512-NEXT: vmovdqa64 %ymm18, %ymm7
13510 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13511 ; AVX512-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13512 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
13513 ; AVX512-NEXT: vmovdqa64 %ymm31, %ymm4
13514 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13515 ; AVX512-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13516 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
13517 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
13518 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
13519 ; AVX512-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
13520 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13521 ; AVX512-NEXT: vmovaps %zmm3, 64(%rsi)
13522 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13523 ; AVX512-NEXT: vmovaps %zmm3, (%rsi)
13524 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13525 ; AVX512-NEXT: vmovaps %zmm3, 64(%rdx)
13526 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13527 ; AVX512-NEXT: vmovaps %zmm3, (%rdx)
13528 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13529 ; AVX512-NEXT: vmovaps %zmm3, 64(%rcx)
13530 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13531 ; AVX512-NEXT: vmovaps %zmm3, (%rcx)
13532 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13533 ; AVX512-NEXT: vmovaps %zmm3, 64(%r8)
13534 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13535 ; AVX512-NEXT: vmovaps %zmm3, (%r8)
13536 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13537 ; AVX512-NEXT: vmovaps %zmm3, 64(%r9)
13538 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13539 ; AVX512-NEXT: vmovaps %zmm3, (%r9)
13540 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
13541 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13542 ; AVX512-NEXT: vmovaps %zmm3, 64(%rax)
13543 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
13544 ; AVX512-NEXT: vmovaps %zmm3, (%rax)
13545 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
13546 ; AVX512-NEXT: vmovdqa64 %zmm0, 64(%rax)
13547 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
13548 ; AVX512-NEXT: vmovaps %zmm0, (%rax)
13549 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
13550 ; AVX512-NEXT: vmovdqa64 %zmm2, 64(%rax)
13551 ; AVX512-NEXT: vmovdqa64 %zmm1, (%rax)
13552 ; AVX512-NEXT: addq $2408, %rsp # imm = 0x968
13553 ; AVX512-NEXT: vzeroupper
13554 ; AVX512-NEXT: retq
13555 ;
13556 ; AVX512-FCP-LABEL: load_i16_stride8_vf64:
13557 ; AVX512-FCP: # %bb.0:
13558 ; AVX512-FCP-NEXT: subq $2312, %rsp # imm = 0x908
13559 ; AVX512-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
13560 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13561 ; AVX512-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
13562 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13563 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13564 ; AVX512-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
13565 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13566 ; AVX512-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
13567 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13568 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13569 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
13570 ; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0
13571 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm0
13572 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
13573 ; AVX512-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
13574 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13575 ; AVX512-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
13576 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13577 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13578 ; AVX512-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
13579 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13580 ; AVX512-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
13581 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13582 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13583 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
13584 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm30
13585 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13586 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
13587 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13588 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13589 ; AVX512-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
13590 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13591 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
13592 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13593 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
13594 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13595 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13596 ; AVX512-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
13597 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13598 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
13599 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13600 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
13601 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13602 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13603 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13604 ; AVX512-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
13605 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13606 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
13607 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13608 ; AVX512-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
13609 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13610 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
13611 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13612 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
13613 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13614 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13615 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
13616 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13617 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13618 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13619 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13620 ; AVX512-FCP-NEXT: movb $-64, %al
13621 ; AVX512-FCP-NEXT: kmovw %eax, %k1
13622 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13623 ; AVX512-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
13624 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13625 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
13626 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13627 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13628 ; AVX512-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
13629 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13630 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
13631 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13632 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13633 ; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm1
13634 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
13635 ; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm1
13636 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm19
13637 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13638 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm2
13639 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13640 ; AVX512-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
13641 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13642 ; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
13643 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13644 ; AVX512-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
13645 ; AVX512-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13646 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
13647 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13648 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
13649 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
13650 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm31
13651 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13652 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm2
13653 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13654 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
13655 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13656 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
13657 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13658 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
13659 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13660 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
13661 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13662 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
13663 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4]
13664 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13665 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13666 ; AVX512-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
13667 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13668 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,0,2]
13669 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13670 ; AVX512-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
13671 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13672 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2]
13673 ; AVX512-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13674 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7]
13675 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13676 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7]
13677 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13678 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
13679 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
13680 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13681 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13682 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13683 ; AVX512-FCP-NEXT: vmovdqa 880(%rdi), %xmm0
13684 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13685 ; AVX512-FCP-NEXT: vmovdqa 864(%rdi), %xmm1
13686 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13687 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13688 ; AVX512-FCP-NEXT: vmovdqa 848(%rdi), %xmm0
13689 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13690 ; AVX512-FCP-NEXT: vmovdqa 832(%rdi), %xmm1
13691 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13692 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13693 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm0
13694 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
13695 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm2
13696 ; AVX512-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm0
13697 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
13698 ; AVX512-FCP-NEXT: vmovdqa 816(%rdi), %xmm1
13699 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13700 ; AVX512-FCP-NEXT: vmovdqa 800(%rdi), %xmm3
13701 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13702 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
13703 ; AVX512-FCP-NEXT: vmovdqa 784(%rdi), %xmm1
13704 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13705 ; AVX512-FCP-NEXT: vmovdqa 768(%rdi), %xmm3
13706 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13707 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
13708 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
13709 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
13710 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm25
13711 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13712 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm9
13713 ; AVX512-FCP-NEXT: vmovdqa 992(%rdi), %ymm0
13714 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13715 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
13716 ; AVX512-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
13717 ; AVX512-FCP-NEXT: vmovdqa 960(%rdi), %ymm0
13718 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13719 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
13720 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13721 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4]
13722 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13723 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4]
13724 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13725 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13726 ; AVX512-FCP-NEXT: vmovdqa 928(%rdi), %ymm1
13727 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13728 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
13729 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13730 ; AVX512-FCP-NEXT: vmovdqa 896(%rdi), %ymm1
13731 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13732 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2]
13733 ; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13734 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
13735 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13736 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7]
13737 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13738 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7]
13739 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13740 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1}
13741 ; AVX512-FCP-NEXT: vmovdqa 624(%rdi), %xmm0
13742 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13743 ; AVX512-FCP-NEXT: vmovdqa 608(%rdi), %xmm1
13744 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13745 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13746 ; AVX512-FCP-NEXT: vmovdqa 592(%rdi), %xmm0
13747 ; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13748 ; AVX512-FCP-NEXT: vmovdqa 576(%rdi), %xmm1
13749 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13750 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
13751 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm0
13752 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm29
13753 ; AVX512-FCP-NEXT: vpermt2d %xmm11, %xmm2, %xmm0
13754 ; AVX512-FCP-NEXT: vmovdqa64 %xmm11, %xmm16
13755 ; AVX512-FCP-NEXT: vmovdqa 560(%rdi), %xmm1
13756 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13757 ; AVX512-FCP-NEXT: vmovdqa 544(%rdi), %xmm2
13758 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13759 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13760 ; AVX512-FCP-NEXT: vmovdqa 528(%rdi), %xmm1
13761 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13762 ; AVX512-FCP-NEXT: vmovdqa 512(%rdi), %xmm2
13763 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13764 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
13765 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
13766 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
13767 ; AVX512-FCP-NEXT: vmovdqa64 %xmm11, %xmm21
13768 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
13769 ; AVX512-FCP-NEXT: vmovdqa 736(%rdi), %ymm0
13770 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13771 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
13772 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13773 ; AVX512-FCP-NEXT: vmovdqa 704(%rdi), %ymm0
13774 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13775 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
13776 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13777 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
13778 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
13779 ; AVX512-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
13780 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
13781 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
13782 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7]
13783 ; AVX512-FCP-NEXT: vmovdqa 672(%rdi), %ymm0
13784 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13785 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
13786 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13787 ; AVX512-FCP-NEXT: vmovdqa 640(%rdi), %ymm0
13788 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13789 ; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2]
13790 ; AVX512-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13791 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7]
13792 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
13793 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7]
13794 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
13795 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
13796 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
13797 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
13798 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
13799 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13800 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0]
13801 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
13802 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm0
13803 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm30
13804 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1]
13805 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
13806 ; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
13807 ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13808 ; AVX512-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13809 ; AVX512-FCP-NEXT: # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13810 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
13811 ; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13812 ; AVX512-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13813 ; AVX512-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
13814 ; AVX512-FCP-NEXT: # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13815 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
13816 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13817 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
13818 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
13819 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
13820 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
13821 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
13822 ; AVX512-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm9
13823 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1]
13824 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
13825 ; AVX512-FCP-NEXT: vmovdqa64 %ymm28, %ymm11
13826 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13827 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13828 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
13829 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13830 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13831 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
13832 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
13833 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
13834 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
13835 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13836 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
13837 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
13838 ; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm11
13839 ; AVX512-FCP-NEXT: vpermt2d %xmm25, %xmm13, %xmm0
13840 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm8
13841 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1]
13842 ; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm19
13843 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
13844 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13845 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13846 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
13847 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13848 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13849 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
13850 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13851 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
13852 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
13853 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
13854 ; AVX512-FCP-NEXT: vpermt2d %xmm21, %xmm13, %xmm3
13855 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1]
13856 ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm6
13857 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
13858 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm4
13859 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
13860 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
13861 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
13862 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
13863 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
13864 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
13865 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
13866 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
13867 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13868 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13869 ; AVX512-FCP-NEXT: vmovdqa %xmm14, %xmm0
13870 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3]
13871 ; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
13872 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
13873 ; AVX512-FCP-NEXT: vpermt2d %xmm24, %xmm5, %xmm0
13874 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
13875 ; AVX512-FCP-NEXT: vmovdqa64 %xmm30, %xmm28
13876 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3]
13877 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13878 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13879 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13880 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
13881 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
13882 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13883 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13884 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
13885 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13886 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
13887 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13888 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13889 ; AVX512-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
13890 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13891 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
13892 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
13893 ; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
13894 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13895 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
13896 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
13897 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
13898 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13899 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm1
13900 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
13901 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3]
13902 ; AVX512-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm1
13903 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3]
13904 ; AVX512-FCP-NEXT: vmovdqa64 %xmm12, %xmm25
13905 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm23
13906 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
13907 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
13908 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
13909 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13910 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
13911 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
13912 ; AVX512-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
13913 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13914 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
13915 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
13916 ; AVX512-FCP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
13917 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13918 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
13919 ; AVX512-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
13920 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13921 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
13922 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
13923 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
13924 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
13925 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13926 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
13927 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3]
13928 ; AVX512-FCP-NEXT: vpermt2d %xmm8, %xmm5, %xmm0
13929 ; AVX512-FCP-NEXT: vmovdqa64 %xmm9, %xmm19
13930 ; AVX512-FCP-NEXT: vmovdqa64 %xmm11, %xmm20
13931 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
13932 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
13933 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24
13934 ; AVX512-FCP-NEXT: vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload
13935 ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
13936 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13937 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
13938 ; AVX512-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
13939 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13940 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
13941 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
13942 ; AVX512-FCP-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
13943 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13944 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
13945 ; AVX512-FCP-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7]
13946 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13947 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
13948 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
13949 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1}
13950 ; AVX512-FCP-NEXT: vmovdqa %xmm6, %xmm0
13951 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3]
13952 ; AVX512-FCP-NEXT: vpermt2d %xmm16, %xmm5, %xmm0
13953 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3]
13954 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
13955 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
13956 ; AVX512-FCP-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7]
13957 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
13958 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
13959 ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
13960 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
13961 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
13962 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
13963 ; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
13964 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
13965 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
13966 ; AVX512-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
13967 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
13968 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
13969 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
13970 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
13971 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
13972 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
13973 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
13974 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
13975 ; AVX512-FCP-NEXT: vpermt2d %xmm28, %xmm16, %xmm1
13976 ; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
13977 ; AVX512-FCP-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
13978 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
13979 ; AVX512-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
13980 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13981 ; AVX512-FCP-NEXT: vmovdqa64 %ymm29, %ymm3
13982 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13983 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
13984 ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
13985 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
13986 ; AVX512-FCP-NEXT: vmovdqa64 %ymm27, %ymm15
13987 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
13988 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
13989 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
13990 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
13991 ; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm1
13992 ; AVX512-FCP-NEXT: vpermt2d %xmm23, %xmm16, %xmm1
13993 ; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm3
13994 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
13995 ; AVX512-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
13996 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
13997 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
13998 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
13999 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14000 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14001 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
14002 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
14003 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
14004 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14005 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14006 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
14007 ; AVX512-FCP-NEXT: vpermt2d %xmm20, %xmm16, %xmm0
14008 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
14009 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14010 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14011 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14012 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14013 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
14014 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14015 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14016 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
14017 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
14018 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14019 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
14020 ; AVX512-FCP-NEXT: vpermt2d %xmm21, %xmm16, %xmm1
14021 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
14022 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14023 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14024 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
14025 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14026 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14027 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
14028 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
14029 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14030 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14031 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14032 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14033 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
14034 ; AVX512-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14035 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14036 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
14037 ; AVX512-FCP-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14038 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14039 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
14040 ; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14041 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14042 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
14043 ; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14044 ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0
14045 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
14046 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
14047 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
14048 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
14049 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
14050 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
14051 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14052 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14053 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14054 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3]
14055 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14056 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14057 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
14058 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14059 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
14060 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14061 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14062 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
14063 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14064 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14065 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14066 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14067 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
14068 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14069 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14070 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3]
14071 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14072 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
14073 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14074 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14075 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
14076 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14077 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14078 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14079 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14080 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14081 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14082 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
14083 ; AVX512-FCP-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
14084 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14085 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
14086 ; AVX512-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
14087 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14088 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
14089 ; AVX512-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
14090 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14091 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
14092 ; AVX512-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
14093 ; AVX512-FCP-NEXT: vmovdqa %xmm2, %xmm1
14094 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm27
14095 ; AVX512-FCP-NEXT: vpermt2d %xmm6, %xmm5, %xmm1
14096 ; AVX512-FCP-NEXT: vmovdqa64 %xmm6, %xmm18
14097 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
14098 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm22
14099 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
14100 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14101 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14102 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
14103 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14104 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14105 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3]
14106 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14107 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
14108 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14109 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14110 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
14111 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14112 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
14113 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14114 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14115 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3]
14116 ; AVX512-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14117 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
14118 ; AVX512-FCP-NEXT: # ymm4 = mem[0,1,1,3]
14119 ; AVX512-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14120 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,2,2,3,4,6,6,7]
14121 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14122 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,2,2,3,4,6,6,7]
14123 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14124 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
14125 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14126 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14127 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14128 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14129 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14130 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
14131 ; AVX512-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14132 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14133 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14134 ; AVX512-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14135 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14136 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
14137 ; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14138 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14139 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
14140 ; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14141 ; AVX512-FCP-NEXT: vmovdqa %xmm1, %xmm0
14142 ; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm31
14143 ; AVX512-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
14144 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
14145 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
14146 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm21
14147 ; AVX512-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14148 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
14149 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14150 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
14151 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
14152 ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3]
14153 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14154 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14155 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3]
14156 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14157 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4]
14158 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14159 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4]
14160 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14161 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
14162 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14163 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3]
14164 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14165 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14166 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
14167 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14168 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,2,3,4,6,6,7]
14169 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14170 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,2,2,3,4,6,6,7]
14171 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14172 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
14173 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14174 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
14175 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14176 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
14177 ; AVX512-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14178 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14179 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
14180 ; AVX512-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14181 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14182 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
14183 ; AVX512-FCP-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14184 ; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14185 ; AVX512-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
14186 ; AVX512-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14187 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm1
14188 ; AVX512-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm1
14189 ; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
14190 ; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm16
14191 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
14192 ; AVX512-FCP-NEXT: vmovdqa64 %xmm13, %xmm28
14193 ; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm30
14194 ; AVX512-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14195 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3]
14196 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
14197 ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3]
14198 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14199 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14200 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,1,3]
14201 ; AVX512-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14202 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4]
14203 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14204 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4]
14205 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14206 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
14207 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
14208 ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,1,3]
14209 ; AVX512-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14210 ; AVX512-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14211 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3]
14212 ; AVX512-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14213 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
14214 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14215 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
14216 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14217 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
14218 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
14219 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
14220 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
14221 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14222 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
14223 ; AVX512-FCP-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
14224 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
14225 ; AVX512-FCP-NEXT: vpermt2d %xmm19, %xmm15, %xmm0
14226 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14227 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
14228 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14229 ; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14230 ; AVX512-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14231 ; AVX512-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
14232 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14233 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
14234 ; AVX512-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
14235 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14236 ; AVX512-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
14237 ; AVX512-FCP-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14238 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
14239 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14240 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
14241 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14242 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
14243 ; AVX512-FCP-NEXT: vpermt2d %xmm24, %xmm15, %xmm1
14244 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1]
14245 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
14246 ; AVX512-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
14247 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14248 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm13
14249 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14250 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
14251 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14252 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14253 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
14254 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7]
14255 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
14256 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14257 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14258 ; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
14259 ; AVX512-FCP-NEXT: vpermt2d %xmm25, %xmm15, %xmm0
14260 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm13
14261 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1]
14262 ; AVX512-FCP-NEXT: vmovdqa64 %xmm31, %xmm21
14263 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14264 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14265 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14266 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7]
14267 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14268 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14269 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
14270 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
14271 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14272 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14273 ; AVX512-FCP-NEXT: vmovdqa %xmm15, %xmm6
14274 ; AVX512-FCP-NEXT: vpermi2d %xmm28, %xmm30, %xmm6
14275 ; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
14276 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm7
14277 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
14278 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14279 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14280 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
14281 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14282 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14283 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
14284 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
14285 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14286 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14287 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14288 ; AVX512-FCP-NEXT: vmovdqa %xmm9, %xmm0
14289 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3]
14290 ; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14291 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
14292 ; AVX512-FCP-NEXT: vpermt2d %xmm29, %xmm5, %xmm0
14293 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3]
14294 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14295 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14296 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14297 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
14298 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14299 ; AVX512-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
14300 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14301 ; AVX512-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
14302 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14303 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
14304 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14305 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14306 ; AVX512-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
14307 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14308 ; AVX512-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
14309 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
14310 ; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
14311 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14312 ; AVX512-FCP-NEXT: vmovdqa64 %ymm4, %ymm23
14313 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14314 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14315 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14316 ; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
14317 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3]
14318 ; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm5, %xmm1
14319 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
14320 ; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm19
14321 ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm18
14322 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
14323 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
14324 ; AVX512-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7]
14325 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14326 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
14327 ; AVX512-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
14328 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14329 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14330 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
14331 ; AVX512-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
14332 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14333 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
14334 ; AVX512-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
14335 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14336 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
14337 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14338 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14339 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14340 ; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14341 ; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
14342 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3]
14343 ; AVX512-FCP-NEXT: vpermt2d %xmm13, %xmm5, %xmm0
14344 ; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm22
14345 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload
14346 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3]
14347 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
14348 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm21
14349 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
14350 ; AVX512-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
14351 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14352 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
14353 ; AVX512-FCP-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7]
14354 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14355 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
14356 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
14357 ; AVX512-FCP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
14358 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14359 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
14360 ; AVX512-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
14361 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14362 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
14363 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14364 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1}
14365 ; AVX512-FCP-NEXT: vmovdqa %xmm5, %xmm1
14366 ; AVX512-FCP-NEXT: vmovdqa64 %xmm7, %xmm25
14367 ; AVX512-FCP-NEXT: vmovdqa64 %xmm17, %xmm27
14368 ; AVX512-FCP-NEXT: vpermi2d %xmm7, %xmm17, %xmm1
14369 ; AVX512-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
14370 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3]
14371 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3]
14372 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
14373 ; AVX512-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
14374 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14375 ; AVX512-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
14376 ; AVX512-FCP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
14377 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14378 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
14379 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
14380 ; AVX512-FCP-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
14381 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14382 ; AVX512-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14383 ; AVX512-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
14384 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14385 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
14386 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
14387 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
14388 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
14389 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
14390 ; AVX512-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
14391 ; AVX512-FCP-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
14392 ; AVX512-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14393 ; AVX512-FCP-NEXT: # xmm1 = xmm0[0,1],mem[2,3]
14394 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
14395 ; AVX512-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
14396 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14397 ; AVX512-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
14398 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14399 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
14400 ; AVX512-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
14401 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14402 ; AVX512-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
14403 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14404 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
14405 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
14406 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
14407 ; AVX512-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
14408 ; AVX512-FCP-NEXT: vpermt2d %xmm18, %xmm17, %xmm0
14409 ; AVX512-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
14410 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
14411 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14412 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14413 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
14414 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14415 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14416 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
14417 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
14418 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
14419 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
14420 ; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm1
14421 ; AVX512-FCP-NEXT: vpermt2d %xmm22, %xmm17, %xmm1
14422 ; AVX512-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
14423 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14424 ; AVX512-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
14425 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14426 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14427 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
14428 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14429 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14430 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
14431 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
14432 ; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1}
14433 ; AVX512-FCP-NEXT: vmovdqa64 %xmm29, %xmm4
14434 ; AVX512-FCP-NEXT: vpermt2d %xmm28, %xmm17, %xmm4
14435 ; AVX512-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3]
14436 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
14437 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14438 ; AVX512-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14439 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
14440 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14441 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14442 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
14443 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
14444 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
14445 ; AVX512-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
14446 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14447 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rsi)
14448 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14449 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rsi)
14450 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14451 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
14452 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14453 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rdx)
14454 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14455 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rcx)
14456 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14457 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rcx)
14458 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14459 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%r8)
14460 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14461 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%r8)
14462 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14463 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%r9)
14464 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14465 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%r9)
14466 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14467 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14468 ; AVX512-FCP-NEXT: vmovaps %zmm2, 64(%rax)
14469 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14470 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rax)
14471 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14472 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax)
14473 ; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
14474 ; AVX512-FCP-NEXT: vmovaps %zmm2, (%rax)
14475 ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
14476 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
14477 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
14478 ; AVX512-FCP-NEXT: addq $2312, %rsp # imm = 0x908
14479 ; AVX512-FCP-NEXT: vzeroupper
14480 ; AVX512-FCP-NEXT: retq
14481 ;
14482 ; AVX512DQ-LABEL: load_i16_stride8_vf64:
14483 ; AVX512DQ: # %bb.0:
14484 ; AVX512DQ-NEXT: subq $2408, %rsp # imm = 0x968
14485 ; AVX512DQ-NEXT: vmovdqa 368(%rdi), %xmm1
14486 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14487 ; AVX512DQ-NEXT: vmovdqa 352(%rdi), %xmm0
14488 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14489 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
14490 ; AVX512DQ-NEXT: vmovdqa 336(%rdi), %xmm2
14491 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14492 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %xmm1
14493 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14494 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14495 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm6 = [0,0,0,4]
14496 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14497 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14498 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
14499 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
14500 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14501 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
14502 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14503 ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
14504 ; AVX512DQ-NEXT: vmovdqa 304(%rdi), %xmm0
14505 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14506 ; AVX512DQ-NEXT: vmovdqa 288(%rdi), %xmm2
14507 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14508 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14509 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14510 ; AVX512DQ-NEXT: vmovdqa 272(%rdi), %xmm0
14511 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14512 ; AVX512DQ-NEXT: vmovdqa 256(%rdi), %xmm2
14513 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14514 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14515 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
14516 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm31
14517 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14518 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14519 ; AVX512DQ-NEXT: vmovdqa 480(%rdi), %ymm1
14520 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14521 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
14522 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14523 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
14524 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14525 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14526 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm2
14527 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14528 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
14529 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14530 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
14531 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14532 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14533 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14534 ; AVX512DQ-NEXT: vmovdqa 416(%rdi), %ymm2
14535 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14536 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
14537 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14538 ; AVX512DQ-NEXT: vmovdqa 384(%rdi), %ymm2
14539 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14540 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm2[0,1,0,2]
14541 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14542 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
14543 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14544 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14545 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm5[0,2,2,3,4,6,6,7]
14546 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14547 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14548 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14549 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14550 ; AVX512DQ-NEXT: movb $-64, %al
14551 ; AVX512DQ-NEXT: kmovw %eax, %k1
14552 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14553 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm2
14554 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14555 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm1
14556 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14557 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14558 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm3
14559 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14560 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2
14561 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14562 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
14563 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2]
14564 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
14565 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14566 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14567 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14568 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14569 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14570 ; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm6, %xmm2
14571 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1
14572 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14573 ; AVX512DQ-NEXT: vmovdqa 16(%rdi), %xmm3
14574 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14575 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm4
14576 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14577 ; AVX512DQ-NEXT: vmovdqa 48(%rdi), %xmm5
14578 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14579 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
14580 ; AVX512DQ-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14581 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
14582 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
14583 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm17
14584 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
14585 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %ymm2
14586 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14587 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
14588 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14589 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2
14590 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14591 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
14592 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14593 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,4]
14594 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14595 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14596 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,1,2,0,4,5,6,4]
14597 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14598 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14599 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
14600 ; AVX512DQ-NEXT: vmovdqa 160(%rdi), %ymm3
14601 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14602 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm24 = ymm3[0,1,0,2]
14603 ; AVX512DQ-NEXT: vmovdqa 128(%rdi), %ymm3
14604 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14605 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm22 = ymm3[0,1,0,2]
14606 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm24[0,2,2,3,4,6,6,7]
14607 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14608 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14609 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm22[0,2,2,3,4,6,6,7]
14610 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14611 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
14612 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
14613 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14614 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14615 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14616 ; AVX512DQ-NEXT: vmovdqa 880(%rdi), %xmm1
14617 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14618 ; AVX512DQ-NEXT: vmovdqa 864(%rdi), %xmm0
14619 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14620 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
14621 ; AVX512DQ-NEXT: vmovdqa 848(%rdi), %xmm2
14622 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14623 ; AVX512DQ-NEXT: vmovdqa 832(%rdi), %xmm1
14624 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14625 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14626 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
14627 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
14628 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14629 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
14630 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14631 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
14632 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14633 ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm6, %xmm1
14634 ; AVX512DQ-NEXT: vmovdqa %xmm6, %xmm10
14635 ; AVX512DQ-NEXT: vmovdqa 816(%rdi), %xmm0
14636 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14637 ; AVX512DQ-NEXT: vmovdqa 800(%rdi), %xmm2
14638 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14639 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14640 ; AVX512DQ-NEXT: vmovdqa 784(%rdi), %xmm0
14641 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14642 ; AVX512DQ-NEXT: vmovdqa 768(%rdi), %xmm2
14643 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14644 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
14645 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
14646 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm19
14647 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm20
14648 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
14649 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14650 ; AVX512DQ-NEXT: vmovdqa 992(%rdi), %ymm1
14651 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14652 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
14653 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14654 ; AVX512DQ-NEXT: vmovdqa 960(%rdi), %ymm1
14655 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14656 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,1,0,2]
14657 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14658 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm3[0,1,2,0,4,5,6,4]
14659 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14660 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,1,2,0,4,5,6,4]
14661 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm6[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14662 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14663 ; AVX512DQ-NEXT: vmovdqa 928(%rdi), %ymm2
14664 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14665 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
14666 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14667 ; AVX512DQ-NEXT: vmovdqa 896(%rdi), %ymm2
14668 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14669 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm18 = ymm2[0,1,0,2]
14670 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
14671 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14672 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7]
14673 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14674 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4],ymm2[5],ymm9[6,7]
14675 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14676 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14677 ; AVX512DQ-NEXT: vmovdqa 624(%rdi), %xmm2
14678 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14679 ; AVX512DQ-NEXT: vmovdqa 608(%rdi), %xmm1
14680 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14681 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
14682 ; AVX512DQ-NEXT: vmovdqa 592(%rdi), %xmm5
14683 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14684 ; AVX512DQ-NEXT: vmovdqa 576(%rdi), %xmm2
14685 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14686 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
14687 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[2,2,2,2]
14688 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm5 = xmm2[0,1,2],xmm9[3]
14689 ; AVX512DQ-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
14690 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm16 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14691 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
14692 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14693 ; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
14694 ; AVX512DQ-NEXT: vmovdqa 560(%rdi), %xmm1
14695 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14696 ; AVX512DQ-NEXT: vmovdqa 544(%rdi), %xmm5
14697 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14698 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
14699 ; AVX512DQ-NEXT: vmovdqa 528(%rdi), %xmm1
14700 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14701 ; AVX512DQ-NEXT: vmovdqa 512(%rdi), %xmm5
14702 ; AVX512DQ-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14703 ; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
14704 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
14705 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm1[0,1],xmm2[2,3]
14706 ; AVX512DQ-NEXT: vmovdqa 736(%rdi), %ymm1
14707 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14708 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm29 = ymm1[0,1,0,2]
14709 ; AVX512DQ-NEXT: vmovdqa 704(%rdi), %ymm1
14710 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14711 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
14712 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14713 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm29[0,1,2,0,4,5,6,4]
14714 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14715 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
14716 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14717 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm11[0,1,2,3,4,5,6],ymm10[7]
14718 ; AVX512DQ-NEXT: vmovdqa 672(%rdi), %ymm5
14719 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14720 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm28 = ymm5[0,1,0,2]
14721 ; AVX512DQ-NEXT: vmovdqa 640(%rdi), %ymm5
14722 ; AVX512DQ-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14723 ; AVX512DQ-NEXT: vpermq {{.*#+}} ymm30 = ymm5[0,1,0,2]
14724 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm28[0,2,2,3,4,6,6,7]
14725 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm14 = ymm10[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14726 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm30[0,2,2,3,4,6,6,7]
14727 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm5 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14728 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5],ymm5[6,7]
14729 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm13[6,7]
14730 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
14731 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
14732 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14733 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm31[1,1,1,1]
14734 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
14735 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm14[1],xmm0[2,3]
14736 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14737 ; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
14738 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14739 ; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
14740 ; AVX512DQ-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14741 ; AVX512DQ-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
14742 ; AVX512DQ-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14743 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
14744 ; AVX512DQ-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
14745 ; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14746 ; AVX512DQ-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
14747 ; AVX512DQ-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14748 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
14749 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
14750 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 {%k1}
14751 ; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
14752 ; AVX512DQ-NEXT: # ymm5 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14753 ; AVX512DQ-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
14754 ; AVX512DQ-NEXT: # ymm9 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14755 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5,6],ymm5[7]
14756 ; AVX512DQ-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
14757 ; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14758 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm12[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14759 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm13[0,1,2,3,4],ymm9[5],ymm13[6,7]
14760 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
14761 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
14762 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
14763 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm13[1],xmm9[2,3]
14764 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
14765 ; AVX512DQ-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
14766 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
14767 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm5, %zmm0, %zmm0
14768 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14769 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm19[1,1,1,1]
14770 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm9
14771 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm9[1],xmm0[2,3]
14772 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14773 ; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
14774 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14775 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14776 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4,5,6],ymm5[7]
14777 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14778 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14779 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
14780 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
14781 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14782 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
14783 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm15[1,1,1,1]
14784 ; AVX512DQ-NEXT: vmovdqa %xmm8, %xmm5
14785 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm8[1],xmm3[2,3]
14786 ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm4
14787 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
14788 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
14789 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
14790 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
14791 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm10[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
14792 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
14793 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
14794 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14795 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
14796 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14797 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14798 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm31[2],xmm14[2],xmm31[3],xmm14[3]
14799 ; AVX512DQ-NEXT: vmovdqa64 %xmm14, %xmm16
14800 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14801 ; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
14802 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14803 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14804 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
14805 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14806 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm27
14807 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14808 ; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
14809 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14810 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26
14811 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14812 ; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14813 ; AVX512DQ-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
14814 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14815 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm23
14816 ; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
14817 ; AVX512DQ-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
14818 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14819 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm21
14820 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14821 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14822 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14823 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14824 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
14825 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14826 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm25
14827 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
14828 ; AVX512DQ-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
14829 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14830 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14831 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm24[3,1,2,3,7,5,6,7]
14832 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14833 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm22[3,1,2,3,7,5,6,7]
14834 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14835 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14836 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14837 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm13[2],xmm17[3],xmm13[3]
14838 ; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm22
14839 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
14840 ; AVX512DQ-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
14841 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
14842 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14843 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14844 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm19[2],xmm20[2],xmm19[3],xmm20[3]
14845 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm24
14846 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14847 ; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
14848 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
14849 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
14850 ; AVX512DQ-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
14851 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14852 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
14853 ; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
14854 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14855 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
14856 ; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
14857 ; AVX512DQ-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
14858 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14859 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm18[3,1,2,3,7,5,6,7]
14860 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14861 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
14862 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
14863 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
14864 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm15[2],xmm5[2],xmm15[3],xmm5[3]
14865 ; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm18
14866 ; AVX512DQ-NEXT: vpblendd $12, (%rsp), %xmm0, %xmm3 # 16-byte Folded Reload
14867 ; AVX512DQ-NEXT: # xmm3 = xmm0[0,1],mem[2,3]
14868 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[0,1,1,3,4,5,5,7]
14869 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
14870 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
14871 ; AVX512DQ-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
14872 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
14873 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14874 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm28[3,1,2,3,7,5,6,7]
14875 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
14876 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm30[3,1,2,3,7,5,6,7]
14877 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
14878 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
14879 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
14880 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
14881 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
14882 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14883 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm0 = [3,7,0,0]
14884 ; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm1
14885 ; AVX512DQ-NEXT: vpermt2d %xmm16, %xmm0, %xmm1
14886 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
14887 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
14888 ; AVX512DQ-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
14889 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14890 ; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm1
14891 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14892 ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm3
14893 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14894 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
14895 ; AVX512DQ-NEXT: vmovdqa64 %ymm23, %ymm3
14896 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14897 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm13
14898 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14899 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4],ymm3[5],ymm13[6,7]
14900 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
14901 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14902 ; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm1
14903 ; AVX512DQ-NEXT: vpermt2d %xmm22, %xmm16, %xmm1
14904 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14905 ; AVX512DQ-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
14906 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm3
14907 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14908 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14909 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
14910 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14911 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14912 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
14913 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm11[0,1,2,3,4,5],ymm3[6,7]
14914 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
14915 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14916 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14917 ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm0
14918 ; AVX512DQ-NEXT: vpermt2d %xmm24, %xmm16, %xmm0
14919 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
14920 ; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
14921 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14922 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14923 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14924 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
14925 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14926 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14927 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4],ymm3[5],ymm6[6,7]
14928 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
14929 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14930 ; AVX512DQ-NEXT: vpermt2d %xmm18, %xmm16, %xmm15
14931 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm1 # 16-byte Folded Reload
14932 ; AVX512DQ-NEXT: # xmm1 = xmm15[0,1],mem[2,3]
14933 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
14934 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
14935 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
14936 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
14937 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
14938 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
14939 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
14940 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
14941 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
14942 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
14943 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14944 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm5 # 16-byte Folded Reload
14945 ; AVX512DQ-NEXT: # xmm5 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14946 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14947 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
14948 ; AVX512DQ-NEXT: # xmm6 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14949 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[2,2,2,2]
14950 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm6[0,1,2],xmm0[3]
14951 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14952 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
14953 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14954 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm6[2],xmm5[2],xmm6[3],xmm5[3]
14955 ; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14956 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm10 = [0,0,0,4]
14957 ; AVX512DQ-NEXT: vpermt2d %xmm5, %xmm10, %xmm6
14958 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14959 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
14960 ; AVX512DQ-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14961 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
14962 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
14963 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
14964 ; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
14965 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
14966 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm28
14967 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm6[2,3]
14968 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
14969 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
14970 ; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
14971 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14972 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14973 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3]
14974 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14975 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
14976 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14977 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
14978 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
14979 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14980 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
14981 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
14982 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
14983 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3]
14984 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14985 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
14986 ; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3]
14987 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14988 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
14989 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14990 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
14991 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
14992 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
14993 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
14994 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
14995 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
14996 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
14997 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
14998 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
14999 ; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15000 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
15001 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
15002 ; AVX512DQ-NEXT: # xmm2 = xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
15003 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[2,2,2,2]
15004 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm2[0,1,2],xmm3[3]
15005 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15006 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
15007 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15008 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15009 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15010 ; AVX512DQ-NEXT: vpermt2d %xmm1, %xmm10, %xmm2
15011 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15012 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
15013 ; AVX512DQ-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15014 ; AVX512DQ-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15015 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15016 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
15017 ; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15018 ; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15019 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
15020 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
15021 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15022 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3]
15023 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15024 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15025 ; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3]
15026 ; AVX512DQ-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15027 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
15028 ; AVX512DQ-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15029 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15030 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
15031 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15032 ; AVX512DQ-NEXT: vmovdqa64 %ymm4, %ymm17
15033 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
15034 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm30 # 32-byte Folded Reload
15035 ; AVX512DQ-NEXT: # ymm30 = mem[0,1,1,3]
15036 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
15037 ; AVX512DQ-NEXT: # ymm4 = mem[0,1,1,3]
15038 ; AVX512DQ-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15039 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm30[0,2,2,3,4,6,6,7]
15040 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15041 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm4[0,2,2,3,4,6,6,7]
15042 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15043 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
15044 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15045 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15046 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15047 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15048 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15049 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
15050 ; AVX512DQ-NEXT: # xmm0 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15051 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15052 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
15053 ; AVX512DQ-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15054 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,2,2]
15055 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm2 = xmm1[0,1,2],xmm2[3]
15056 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15057 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm23 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
15058 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
15059 ; AVX512DQ-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15060 ; AVX512DQ-NEXT: vpermt2d %xmm0, %xmm10, %xmm1
15061 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15062 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
15063 ; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15064 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15065 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
15066 ; AVX512DQ-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15067 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
15068 ; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm16
15069 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm20
15070 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
15071 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
15072 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
15073 ; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3]
15074 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15075 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15076 ; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
15077 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15078 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[0,1,2,0,4,5,6,4]
15079 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15080 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm5 = ymm1[0,1,2,0,4,5,6,4]
15081 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15082 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
15083 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm22 # 32-byte Folded Reload
15084 ; AVX512DQ-NEXT: # ymm22 = mem[0,1,1,3]
15085 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm29 # 32-byte Folded Reload
15086 ; AVX512DQ-NEXT: # ymm29 = mem[0,1,1,3]
15087 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm4 = ymm22[0,2,2,3,4,6,6,7]
15088 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15089 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm29[0,2,2,3,4,6,6,7]
15090 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15091 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4],ymm1[5],ymm8[6,7]
15092 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15093 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
15094 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15095 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
15096 ; AVX512DQ-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15097 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15098 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm8 # 16-byte Folded Reload
15099 ; AVX512DQ-NEXT: # xmm8 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15100 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15101 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm11 # 16-byte Folded Reload
15102 ; AVX512DQ-NEXT: # xmm11 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15103 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15104 ; AVX512DQ-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
15105 ; AVX512DQ-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15106 ; AVX512DQ-NEXT: vmovdqa %xmm10, %xmm1
15107 ; AVX512DQ-NEXT: vpermi2d %xmm2, %xmm8, %xmm1
15108 ; AVX512DQ-NEXT: vmovdqa64 %xmm8, %xmm19
15109 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm24
15110 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
15111 ; AVX512DQ-NEXT: vmovdqa64 %xmm13, %xmm31
15112 ; AVX512DQ-NEXT: vmovdqa64 %xmm11, %xmm21
15113 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm14 = xmm0[0,1],xmm1[2,3]
15114 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15115 ; AVX512DQ-NEXT: # ymm1 = mem[0,1,1,3]
15116 ; AVX512DQ-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15117 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
15118 ; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3]
15119 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15120 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm1[0,1,2,0,4,5,6,4]
15121 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm8 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15122 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm0[0,1,2,0,4,5,6,4]
15123 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm11 = ymm1[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15124 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm15 = ymm11[0,1,2,3,4,5,6],ymm8[7]
15125 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
15126 ; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3]
15127 ; AVX512DQ-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15128 ; AVX512DQ-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
15129 ; AVX512DQ-NEXT: # ymm10 = mem[0,1,1,3]
15130 ; AVX512DQ-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15131 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,2,3,4,6,6,7]
15132 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15133 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm11 = ymm10[0,2,2,3,4,6,6,7]
15134 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm11[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15135 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
15136 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm15[6,7]
15137 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
15138 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
15139 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15140 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm28[1,1,1,1]
15141 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
15142 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm10[1],xmm0[2,3]
15143 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
15144 ; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
15145 ; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
15146 ; AVX512DQ-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15147 ; AVX512DQ-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
15148 ; AVX512DQ-NEXT: # ymm13 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15149 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
15150 ; AVX512DQ-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
15151 ; AVX512DQ-NEXT: # ymm13 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15152 ; AVX512DQ-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
15153 ; AVX512DQ-NEXT: # ymm14 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15154 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4],ymm13[5],ymm14[6,7]
15155 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
15156 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15157 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm12, %zmm0, %zmm0 {%k1}
15158 ; AVX512DQ-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
15159 ; AVX512DQ-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15160 ; AVX512DQ-NEXT: vmovdqa64 %ymm17, %ymm13
15161 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15162 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
15163 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm9 = ymm9[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15164 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15165 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm9[5],ymm7[6,7]
15166 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm12[6,7]
15167 ; AVX512DQ-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
15168 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm17[1,1,1,1]
15169 ; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
15170 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0],xmm15[1],xmm9[2,3]
15171 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9, %xmm9 # 16-byte Folded Reload
15172 ; AVX512DQ-NEXT: # xmm9 = xmm9[0,1],mem[2,3]
15173 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
15174 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
15175 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15176 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm16[1,1,1,1]
15177 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm7
15178 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm7[1],xmm0[2,3]
15179 ; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm9
15180 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
15181 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15182 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15183 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
15184 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15185 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15186 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
15187 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15188 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
15189 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
15190 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm31[1,1,1,1]
15191 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm6
15192 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3]
15193 ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm5
15194 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm19[0],xmm24[0],xmm19[1],xmm24[1]
15195 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
15196 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15197 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15198 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
15199 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15200 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15201 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm2[5],ymm4[6,7]
15202 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15203 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
15204 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15205 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15206 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm28[2],xmm10[2],xmm28[3],xmm10[3]
15207 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
15208 ; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
15209 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15210 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15211 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
15212 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15213 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm27
15214 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15215 ; AVX512DQ-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
15216 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15217 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm26
15218 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15219 ; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15220 ; AVX512DQ-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
15221 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15222 ; AVX512DQ-NEXT: vmovdqa64 %ymm3, %ymm25
15223 ; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
15224 ; AVX512DQ-NEXT: # ymm13 = mem[3,1,2,3,7,5,6,7]
15225 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm13[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15226 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15227 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15228 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
15229 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15230 ; AVX512DQ-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
15231 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15232 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm21
15233 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
15234 ; AVX512DQ-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
15235 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15236 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15237 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm12 = ymm30[3,1,2,3,7,5,6,7]
15238 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15239 ; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
15240 ; AVX512DQ-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
15241 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15242 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15243 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15244 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm17[2],xmm15[2],xmm17[3],xmm15[3]
15245 ; AVX512DQ-NEXT: vmovdqa64 %xmm15, %xmm23
15246 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
15247 ; AVX512DQ-NEXT: # xmm2 = xmm2[0,1],mem[2,3]
15248 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
15249 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15250 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15251 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm16[2],xmm20[2],xmm16[3],xmm20[3]
15252 ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm19
15253 ; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm30
15254 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
15255 ; AVX512DQ-NEXT: # xmm0 = xmm0[0,1],mem[2,3]
15256 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm20
15257 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
15258 ; AVX512DQ-NEXT: # ymm10 = mem[0,1,1,3,4,5,5,7]
15259 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm0 = ymm10[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15260 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
15261 ; AVX512DQ-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
15262 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15263 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
15264 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm8 = ymm22[3,1,2,3,7,5,6,7]
15265 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15266 ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm7 = ymm29[3,1,2,3,7,5,6,7]
15267 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm3 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15268 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
15269 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15270 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm20 {%k1}
15271 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm24[2,2,2,2]
15272 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[3]
15273 ; AVX512DQ-NEXT: vmovdqa64 %xmm5, %xmm22
15274 ; AVX512DQ-NEXT: vmovdqa64 %xmm31, %xmm4
15275 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm6[2],xmm31[3],xmm6[3]
15276 ; AVX512DQ-NEXT: vmovdqa64 %xmm6, %xmm29
15277 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
15278 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
15279 ; AVX512DQ-NEXT: # ymm0 = mem[0,1,1,3,4,5,5,7]
15280 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm1 = ymm0[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15281 ; AVX512DQ-NEXT: vmovdqa64 %ymm0, %ymm18
15282 ; AVX512DQ-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
15283 ; AVX512DQ-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
15284 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15285 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15286 ; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15287 ; AVX512DQ-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
15288 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm0 = ymm2[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15289 ; AVX512DQ-NEXT: vmovdqa64 %ymm2, %ymm31
15290 ; AVX512DQ-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15291 ; AVX512DQ-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
15292 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15293 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
15294 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
15295 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
15296 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0
15297 ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
15298 ; AVX512DQ-NEXT: vmovdqa64 %xmm28, %xmm1
15299 ; AVX512DQ-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload
15300 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
15301 ; AVX512DQ-NEXT: # xmm1 = xmm1[0,1],mem[2,3]
15302 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
15303 ; AVX512DQ-NEXT: vmovdqa64 %ymm27, %ymm3
15304 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15305 ; AVX512DQ-NEXT: vmovdqa64 %ymm26, %ymm6
15306 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm15 = ymm6[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15307 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4,5,6],ymm3[7]
15308 ; AVX512DQ-NEXT: vmovdqa64 %ymm25, %ymm6
15309 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm15 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15310 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm13 = ymm13[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15311 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4],ymm15[5],ymm13[6,7]
15312 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5],ymm3[6,7]
15313 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 {%k1}
15314 ; AVX512DQ-NEXT: vmovdqa64 %xmm17, %xmm3
15315 ; AVX512DQ-NEXT: vpermt2d %xmm23, %xmm16, %xmm3
15316 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
15317 ; AVX512DQ-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
15318 ; AVX512DQ-NEXT: vmovdqa64 %ymm21, %ymm6
15319 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm13 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15320 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15321 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7]
15322 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15323 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15324 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4],ymm12[5],ymm11[6,7]
15325 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
15326 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm11[4,5,6,7]
15327 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm3, %zmm1, %zmm1
15328 ; AVX512DQ-NEXT: vmovdqa64 %xmm19, %xmm3
15329 ; AVX512DQ-NEXT: vpermt2d %xmm30, %xmm16, %xmm3
15330 ; AVX512DQ-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
15331 ; AVX512DQ-NEXT: # xmm3 = xmm3[0,1],mem[2,3]
15332 ; AVX512DQ-NEXT: vinserti32x4 $2, %xmm3, %zmm0, %zmm3
15333 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15334 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15335 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7]
15336 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15337 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm6 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15338 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm8[5],ymm6[6,7]
15339 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm9[6,7]
15340 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm3 {%k1}
15341 ; AVX512DQ-NEXT: vpermt2d %xmm29, %xmm16, %xmm4
15342 ; AVX512DQ-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
15343 ; AVX512DQ-NEXT: vpblendd {{.*#+}} xmm6 = xmm4[0,1],xmm6[2,3]
15344 ; AVX512DQ-NEXT: vmovdqa64 %ymm18, %ymm7
15345 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15346 ; AVX512DQ-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15347 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm7[7]
15348 ; AVX512DQ-NEXT: vmovdqa64 %ymm31, %ymm4
15349 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15350 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15351 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
15352 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm5[6,7]
15353 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
15354 ; AVX512DQ-NEXT: vinserti64x4 $0, %ymm2, %zmm3, %zmm2
15355 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15356 ; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rsi)
15357 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15358 ; AVX512DQ-NEXT: vmovaps %zmm3, (%rsi)
15359 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15360 ; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rdx)
15361 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15362 ; AVX512DQ-NEXT: vmovaps %zmm3, (%rdx)
15363 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15364 ; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rcx)
15365 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15366 ; AVX512DQ-NEXT: vmovaps %zmm3, (%rcx)
15367 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15368 ; AVX512DQ-NEXT: vmovaps %zmm3, 64(%r8)
15369 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15370 ; AVX512DQ-NEXT: vmovaps %zmm3, (%r8)
15371 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15372 ; AVX512DQ-NEXT: vmovaps %zmm3, 64(%r9)
15373 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15374 ; AVX512DQ-NEXT: vmovaps %zmm3, (%r9)
15375 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
15376 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15377 ; AVX512DQ-NEXT: vmovaps %zmm3, 64(%rax)
15378 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
15379 ; AVX512DQ-NEXT: vmovaps %zmm3, (%rax)
15380 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
15381 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, 64(%rax)
15382 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
15383 ; AVX512DQ-NEXT: vmovaps %zmm0, (%rax)
15384 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax
15385 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 64(%rax)
15386 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rax)
15387 ; AVX512DQ-NEXT: addq $2408, %rsp # imm = 0x968
15388 ; AVX512DQ-NEXT: vzeroupper
15389 ; AVX512DQ-NEXT: retq
15391 ; AVX512DQ-FCP-LABEL: load_i16_stride8_vf64:
15392 ; AVX512DQ-FCP: # %bb.0:
15393 ; AVX512DQ-FCP-NEXT: subq $2312, %rsp # imm = 0x908
15394 ; AVX512DQ-FCP-NEXT: vmovdqa 368(%rdi), %xmm0
15395 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15396 ; AVX512DQ-FCP-NEXT: vmovdqa 352(%rdi), %xmm1
15397 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15398 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
15399 ; AVX512DQ-FCP-NEXT: vmovdqa 336(%rdi), %xmm0
15400 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15401 ; AVX512DQ-FCP-NEXT: vmovdqa 320(%rdi), %xmm1
15402 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15403 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
15404 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
15405 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0
15406 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm0
15407 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24
15408 ; AVX512DQ-FCP-NEXT: vmovdqa 304(%rdi), %xmm1
15409 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15410 ; AVX512DQ-FCP-NEXT: vmovdqa 288(%rdi), %xmm2
15411 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15412 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15413 ; AVX512DQ-FCP-NEXT: vmovdqa 272(%rdi), %xmm1
15414 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15415 ; AVX512DQ-FCP-NEXT: vmovdqa 256(%rdi), %xmm2
15416 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15417 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15418 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
15419 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm30
15420 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15421 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
15422 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15423 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15424 ; AVX512DQ-FCP-NEXT: vmovdqa 480(%rdi), %ymm1
15425 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15426 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,2]
15427 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15428 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
15429 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15430 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15431 ; AVX512DQ-FCP-NEXT: vmovdqa 448(%rdi), %ymm2
15432 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15433 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,2]
15434 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15435 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
15436 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15437 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15438 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15439 ; AVX512DQ-FCP-NEXT: vmovdqa 416(%rdi), %ymm2
15440 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15441 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
15442 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15443 ; AVX512DQ-FCP-NEXT: vmovdqa 384(%rdi), %ymm2
15444 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15445 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
15446 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15447 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm3[0,2,2,3,4,6,6,7]
15448 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15449 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15450 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm4[0,2,2,3,4,6,6,7]
15451 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15452 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15453 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15454 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15455 ; AVX512DQ-FCP-NEXT: movb $-64, %al
15456 ; AVX512DQ-FCP-NEXT: kmovw %eax, %k1
15457 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
15458 ; AVX512DQ-FCP-NEXT: vmovdqa 112(%rdi), %xmm1
15459 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15460 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm2
15461 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15462 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15463 ; AVX512DQ-FCP-NEXT: vmovdqa 80(%rdi), %xmm1
15464 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15465 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm2
15466 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15467 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15468 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm1
15469 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm20
15470 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm1
15471 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm19
15472 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15473 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm2
15474 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15475 ; AVX512DQ-FCP-NEXT: vmovdqa 16(%rdi), %xmm3
15476 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15477 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm4
15478 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15479 ; AVX512DQ-FCP-NEXT: vmovdqa 48(%rdi), %xmm6
15480 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15481 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
15482 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15483 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
15484 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
15485 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm31
15486 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15487 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm2
15488 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15489 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm2[0,1,0,2]
15490 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15491 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm2
15492 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15493 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,1,0,2]
15494 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15495 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,1,2,0,4,5,6,4]
15496 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15497 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm28
15498 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,1,2,0,4,5,6,4]
15499 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm10[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15500 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
15501 ; AVX512DQ-FCP-NEXT: vmovdqa 160(%rdi), %ymm3
15502 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15503 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,1,0,2]
15504 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15505 ; AVX512DQ-FCP-NEXT: vmovdqa 128(%rdi), %ymm3
15506 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15507 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm3[0,1,0,2]
15508 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15509 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm4[0,2,2,3,4,6,6,7]
15510 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15511 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm6[0,2,2,3,4,6,6,7]
15512 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm7[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15513 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
15514 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15515 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15516 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15517 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15518 ; AVX512DQ-FCP-NEXT: vmovdqa 880(%rdi), %xmm0
15519 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15520 ; AVX512DQ-FCP-NEXT: vmovdqa 864(%rdi), %xmm1
15521 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15522 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
15523 ; AVX512DQ-FCP-NEXT: vmovdqa 848(%rdi), %xmm0
15524 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15525 ; AVX512DQ-FCP-NEXT: vmovdqa 832(%rdi), %xmm1
15526 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15527 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
15528 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm0
15529 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26
15530 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm2
15531 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm3, %xmm5, %xmm0
15532 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm27
15533 ; AVX512DQ-FCP-NEXT: vmovdqa 816(%rdi), %xmm1
15534 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15535 ; AVX512DQ-FCP-NEXT: vmovdqa 800(%rdi), %xmm3
15536 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15537 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
15538 ; AVX512DQ-FCP-NEXT: vmovdqa 784(%rdi), %xmm1
15539 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15540 ; AVX512DQ-FCP-NEXT: vmovdqa 768(%rdi), %xmm3
15541 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15542 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
15543 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
15544 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
15545 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm25
15546 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15547 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm9
15548 ; AVX512DQ-FCP-NEXT: vmovdqa 992(%rdi), %ymm0
15549 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15550 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
15551 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill
15552 ; AVX512DQ-FCP-NEXT: vmovdqa 960(%rdi), %ymm0
15553 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15554 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,1,0,2]
15555 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15556 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm1[0,1,2,0,4,5,6,4]
15557 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15558 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm3[0,1,2,0,4,5,6,4]
15559 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm5[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15560 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
15561 ; AVX512DQ-FCP-NEXT: vmovdqa 928(%rdi), %ymm1
15562 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15563 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,0,2]
15564 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15565 ; AVX512DQ-FCP-NEXT: vmovdqa 896(%rdi), %ymm1
15566 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15567 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm1[0,1,0,2]
15568 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15569 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,2,2,3,4,6,6,7]
15570 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15571 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm11[0,2,2,3,4,6,6,7]
15572 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15573 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4],ymm1[5],ymm12[6,7]
15574 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15575 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm9 {%k1}
15576 ; AVX512DQ-FCP-NEXT: vmovdqa 624(%rdi), %xmm0
15577 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15578 ; AVX512DQ-FCP-NEXT: vmovdqa 608(%rdi), %xmm1
15579 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15580 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
15581 ; AVX512DQ-FCP-NEXT: vmovdqa 592(%rdi), %xmm0
15582 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15583 ; AVX512DQ-FCP-NEXT: vmovdqa 576(%rdi), %xmm1
15584 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15585 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
15586 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm0
15587 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm29
15588 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm11, %xmm2, %xmm0
15589 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm11, %xmm16
15590 ; AVX512DQ-FCP-NEXT: vmovdqa 560(%rdi), %xmm1
15591 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15592 ; AVX512DQ-FCP-NEXT: vmovdqa 544(%rdi), %xmm2
15593 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15594 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15595 ; AVX512DQ-FCP-NEXT: vmovdqa 528(%rdi), %xmm1
15596 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15597 ; AVX512DQ-FCP-NEXT: vmovdqa 512(%rdi), %xmm2
15598 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15599 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
15600 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
15601 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm22
15602 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm11, %xmm21
15603 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm12 = xmm1[0,1],xmm0[2,3]
15604 ; AVX512DQ-FCP-NEXT: vmovdqa 736(%rdi), %ymm0
15605 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15606 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
15607 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15608 ; AVX512DQ-FCP-NEXT: vmovdqa 704(%rdi), %ymm0
15609 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15610 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,2]
15611 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15612 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
15613 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15614 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm1, %ymm18
15615 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
15616 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15617 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5,6],ymm0[7]
15618 ; AVX512DQ-FCP-NEXT: vmovdqa 672(%rdi), %ymm0
15619 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15620 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,1,0,2]
15621 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15622 ; AVX512DQ-FCP-NEXT: vmovdqa 640(%rdi), %ymm0
15623 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15624 ; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm11 = ymm0[0,1,0,2]
15625 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15626 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm15 = ymm1[0,2,2,3,4,6,6,7]
15627 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm15[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15628 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm11[0,2,2,3,4,6,6,7]
15629 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15630 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3,4],ymm0[5],ymm11[6,7]
15631 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm13[6,7]
15632 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7]
15633 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0
15634 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15635 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm13 = [1,5,0,0]
15636 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm0
15637 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm0
15638 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm30
15639 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm14[0],xmm24[0],xmm14[1],xmm24[1]
15640 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
15641 ; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
15642 ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15643 ; AVX512DQ-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
15644 ; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15645 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5,6],ymm9[7]
15646 ; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
15647 ; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15648 ; AVX512DQ-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
15649 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15650 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3,4],ymm11[5],ymm12[6,7]
15651 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15652 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
15653 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm0 {%k1}
15654 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm12
15655 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm9
15656 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm17 # 16-byte Reload
15657 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm17, %xmm13, %xmm9
15658 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm20[0],xmm19[0],xmm20[1],xmm19[1]
15659 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm9[0,1],xmm11[2,3]
15660 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm28, %ymm11
15661 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15662 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm10 = ymm10[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15663 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5,6],ymm11[7]
15664 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15665 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15666 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4],ymm8[5],ymm7[6,7]
15667 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm10[6,7]
15668 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
15669 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm0
15670 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15671 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm9
15672 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
15673 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm11
15674 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm25, %xmm13, %xmm0
15675 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm8
15676 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm26[0],xmm27[0],xmm26[1],xmm27[1]
15677 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm19
15678 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm7[2,3]
15679 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15680 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15681 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm6[7]
15682 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15683 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15684 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
15685 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15686 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
15687 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 {%k1}
15688 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3
15689 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm21, %xmm13, %xmm3
15690 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm29[0],xmm16[0],xmm29[1],xmm16[1]
15691 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm6
15692 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3]
15693 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm4
15694 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
15695 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
15696 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm4[7]
15697 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm15[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
15698 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
15699 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm4[5],ymm1[6,7]
15700 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
15701 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
15702 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15703 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15704 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm14, %xmm0
15705 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm14[2],xmm24[2],xmm14[3],xmm24[3]
15706 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15707 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
15708 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm24, %xmm5, %xmm0
15709 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm31 # 16-byte Reload
15710 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm30, %xmm28
15711 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm31[2],xmm30[2],xmm31[3],xmm30[3]
15712 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15713 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15714 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15715 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3,4,5,5,7]
15716 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15717 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15718 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15719 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
15720 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15721 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm29
15722 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15723 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15724 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
15725 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15726 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
15727 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
15728 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
15729 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15730 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm27
15731 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15732 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15733 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
15734 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm1
15735 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
15736 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm26 = xmm20[2],xmm2[2],xmm20[3],xmm2[3]
15737 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm2, %xmm5, %xmm1
15738 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm12[2],xmm17[2],xmm12[3],xmm17[3]
15739 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm12, %xmm25
15740 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm23
15741 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15742 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15743 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
15744 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15745 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm18
15746 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
15747 ; AVX512DQ-FCP-NEXT: # ymm13 = mem[0,1,1,3,4,5,5,7]
15748 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm13[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15749 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
15750 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
15751 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[3,1,2,3,7,5,6,7]
15752 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm12[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15753 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
15754 ; AVX512DQ-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
15755 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15756 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
15757 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15758 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15759 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15760 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15761 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
15762 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm17 = xmm19[2],xmm8[2],xmm19[3],xmm8[3]
15763 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm8, %xmm5, %xmm0
15764 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm9, %xmm19
15765 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm11, %xmm20
15766 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm9[2],xmm11[2],xmm9[3],xmm11[3]
15767 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15768 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm24
15769 ; AVX512DQ-FCP-NEXT: vpshufd $212, (%rsp), %ymm9 # 32-byte Folded Reload
15770 ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
15771 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15772 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
15773 ; AVX512DQ-FCP-NEXT: # ymm11 = mem[0,1,1,3,4,5,5,7]
15774 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15775 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
15776 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
15777 ; AVX512DQ-FCP-NEXT: # ymm8 = mem[3,1,2,3,7,5,6,7]
15778 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15779 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
15780 ; AVX512DQ-FCP-NEXT: # ymm7 = mem[3,1,2,3,7,5,6,7]
15781 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm7[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15782 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
15783 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
15784 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm24 {%k1}
15785 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm6, %xmm0
15786 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm14 = xmm6[2],xmm16[2],xmm6[3],xmm16[3]
15787 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm16, %xmm5, %xmm0
15788 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm22[2],xmm21[2],xmm22[3],xmm21[3]
15789 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm1[0,1],xmm0[2,3]
15790 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
15791 ; AVX512DQ-FCP-NEXT: # ymm6 = mem[0,1,1,3,4,5,5,7]
15792 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
15793 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
15794 ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,1,3,4,5,5,7]
15795 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm5[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
15796 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15797 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
15798 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
15799 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm4[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
15800 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15801 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[3,1,2,3,7,5,6,7]
15802 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
15803 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
15804 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
15805 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
15806 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0
15807 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15808 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm16 = [3,7,0,0]
15809 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm1
15810 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm28, %xmm16, %xmm1
15811 ; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
15812 ; AVX512DQ-FCP-NEXT: # xmm0 = xmm1[0,1],mem[2,3]
15813 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15814 ; AVX512DQ-FCP-NEXT: vpshufhw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15815 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15816 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm29, %ymm3
15817 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15818 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
15819 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm3
15820 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15821 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm27, %ymm15
15822 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm15[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15823 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm15[0,1,2,3,4],ymm3[5],ymm15[6,7]
15824 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
15825 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
15826 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm1
15827 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm23, %xmm16, %xmm1
15828 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm3
15829 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
15830 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm18, %ymm3
15831 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15832 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15833 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm13[0,1,2,3,4,5,6],ymm3[7]
15834 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm12 = ymm12[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15835 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15836 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm12[5],ymm10[6,7]
15837 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3,4,5],ymm3[6,7]
15838 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
15839 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15840 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15841 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
15842 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm20, %xmm16, %xmm0
15843 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm1
15844 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
15845 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15846 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15847 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm11[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15848 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
15849 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm8[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15850 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15851 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4],ymm3[5],ymm7[6,7]
15852 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
15853 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
15854 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
15855 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm21, %xmm16, %xmm1
15856 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm14[2,3]
15857 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
15858 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
15859 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5,6],ymm3[7]
15860 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
15861 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
15862 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5],ymm2[6,7]
15863 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
15864 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15865 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15866 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15867 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15868 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
15869 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15870 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15871 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm9 # 16-byte Folded Reload
15872 ; AVX512DQ-FCP-NEXT: # xmm9 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15873 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15874 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
15875 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15876 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15877 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
15878 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15879 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0
15880 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,0,4]
15881 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
15882 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm29
15883 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
15884 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm23
15885 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm19
15886 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15887 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
15888 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15889 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3]
15890 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15891 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15892 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
15893 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15894 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,2,0,4,5,6,4]
15895 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15896 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15897 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
15898 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15899 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15900 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
15901 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15902 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
15903 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15904 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15905 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3]
15906 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15907 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
15908 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15909 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15910 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
15911 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15912 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15913 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
15914 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
15915 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
15916 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15917 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm6 # 16-byte Folded Reload
15918 ; AVX512DQ-FCP-NEXT: # xmm6 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15919 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15920 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm2 # 16-byte Folded Reload
15921 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15922 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15923 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm3 # 16-byte Folded Reload
15924 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15925 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
15926 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm4 # 16-byte Folded Reload
15927 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
15928 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, %xmm1
15929 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm27
15930 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm6, %xmm5, %xmm1
15931 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm6, %xmm18
15932 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
15933 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm22
15934 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm24
15935 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
15936 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
15937 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
15938 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15939 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15940 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3]
15941 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15942 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,1,2,0,4,5,6,4]
15943 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15944 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15945 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm3[0,1,2,0,4,5,6,4]
15946 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15947 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm26
15948 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
15949 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
15950 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3]
15951 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15952 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
15953 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[0,1,1,3]
15954 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15955 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm11 = ymm3[0,2,2,3,4,6,6,7]
15956 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm11[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
15957 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm10 = ymm4[0,2,2,3,4,6,6,7]
15958 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm10[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
15959 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
15960 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
15961 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
15962 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
15963 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
15964 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15965 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
15966 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15967 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15968 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
15969 ; AVX512DQ-FCP-NEXT: # xmm1 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15970 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15971 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
15972 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15973 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15974 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
15975 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
15976 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, %xmm0
15977 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm31
15978 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm4, %xmm5, %xmm0
15979 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm20
15980 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
15981 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm21
15982 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
15983 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm25
15984 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
15985 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm12
15986 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
15987 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3]
15988 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15989 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15990 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3]
15991 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
15992 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm14 = ymm0[0,1,2,0,4,5,6,4]
15993 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
15994 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm8 = ymm1[0,1,2,0,4,5,6,4]
15995 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
15996 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
15997 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
15998 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3]
15999 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16000 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
16001 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
16002 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16003 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[0,2,2,3,4,6,6,7]
16004 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
16005 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm6 = ymm2[0,2,2,3,4,6,6,7]
16006 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm6[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
16007 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4],ymm1[5],ymm2[6,7]
16008 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16009 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm12 {%k1}
16010 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
16011 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload
16012 ; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
16013 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
16014 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
16015 ; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
16016 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
16017 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm13 # 16-byte Folded Reload
16018 ; AVX512DQ-FCP-NEXT: # xmm13 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
16019 ; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
16020 ; AVX512DQ-FCP-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload
16021 ; AVX512DQ-FCP-NEXT: # xmm4 = xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
16022 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm1
16023 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm2, %xmm3, %xmm1
16024 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm17
16025 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm16
16026 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm4[0],xmm13[0],xmm4[1],xmm13[1]
16027 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm13, %xmm28
16028 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm30
16029 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16030 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm13 = xmm0[0,1],xmm1[2,3]
16031 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
16032 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3]
16033 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16034 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
16035 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,1,3]
16036 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16037 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[0,1,2,0,4,5,6,4]
16038 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5,6,4,8,9,10,11,12,13,14,12]
16039 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm4 = ymm1[0,1,2,0,4,5,6,4]
16040 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,6,4,6,7,8,9,10,11,14,12,14,15]
16041 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm0[7]
16042 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload
16043 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,1,3]
16044 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16045 ; AVX512DQ-FCP-NEXT: vpermq $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
16046 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3]
16047 ; AVX512DQ-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
16048 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
16049 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,0,2,4,5,6,7,8,9,8,10,12,13,14,15]
16050 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[0,2,2,3,4,6,6,7]
16051 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm15 = ymm2[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
16052 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm15[0,1,2,3,4],ymm0[5],ymm15[6,7]
16053 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
16054 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm0[4,5,6,7]
16055 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm12, %zmm0
16056 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16057 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm0
16058 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill
16059 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm15 = [1,5,0,0]
16060 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm19, %xmm15, %xmm0
16061 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16062 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1]
16063 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
16064 ; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload
16065 ; AVX512DQ-FCP-NEXT: # ymm1 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
16066 ; AVX512DQ-FCP-NEXT: vpshufhw $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
16067 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
16068 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5,6],ymm1[7]
16069 ; AVX512DQ-FCP-NEXT: vpshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
16070 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
16071 ; AVX512DQ-FCP-NEXT: vpshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
16072 ; AVX512DQ-FCP-NEXT: # ymm13 = mem[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
16073 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4],ymm12[5],ymm13[6,7]
16074 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
16075 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm1[6,7]
16076 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
16077 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm1
16078 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm24, %xmm15, %xmm1
16079 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm27[0],xmm18[0],xmm27[1],xmm18[1]
16080 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm12[2,3]
16081 ; AVX512DQ-FCP-NEXT: vpshufhw $116, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
16082 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
16083 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm13
16084 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm13[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
16085 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5,6],ymm12[7]
16086 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm11 = ymm11[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
16087 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
16088 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4],ymm11[5],ymm10[6,7]
16089 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm12[6,7]
16090 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm10[4,5,6,7]
16091 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
16092 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16093 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
16094 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm25, %xmm15, %xmm0
16095 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm13
16096 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm31[0],xmm20[0],xmm31[1],xmm20[1]
16097 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm31, %xmm21
16098 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
16099 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm14[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
16100 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm8 = ymm8[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
16101 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,6],ymm1[7]
16102 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
16103 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
16104 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4],ymm7[5],ymm6[6,7]
16105 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3,4,5],ymm1[6,7]
16106 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
16107 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
16108 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm15, %xmm6
16109 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm28, %xmm30, %xmm6
16110 ; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm17[0],xmm16[0],xmm17[1],xmm16[1]
16111 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm7
16112 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm6[0,1],xmm1[2,3]
16113 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,7,5,8,9,10,11,12,13,15,13]
16114 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,7,5,6,7,8,9,10,11,15,13,14,15]
16115 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7]
16116 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15]
16117 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,3,2,3,4,5,6,7,9,11,10,11,12,13,14,15]
16118 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
16119 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm4[6,7]
16120 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
16121 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
16122 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16123 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm9, %xmm0
16124 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm9[2],xmm29[2],xmm9[3],xmm29[3]
16125 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
16126 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm5 = [0,0,2,6]
16127 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm29, %xmm5, %xmm0
16128 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm23[2],xmm19[2],xmm23[3],xmm19[3]
16129 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
16130 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0
16131 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload
16132 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,1,3,4,5,5,7]
16133 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
16134 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm2, %ymm31
16135 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
16136 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[0,1,1,3,4,5,5,7]
16137 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
16138 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm30
16139 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5,6],ymm1[7]
16140 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
16141 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
16142 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm2 = ymm3[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
16143 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm3, %ymm26
16144 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
16145 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
16146 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
16147 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm4, %ymm23
16148 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4],ymm2[5],ymm3[6,7]
16149 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
16150 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1}
16151 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm1
16152 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm20 = xmm27[2],xmm18[2],xmm27[3],xmm18[3]
16153 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm5, %xmm1
16154 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm22[2],xmm24[2],xmm22[3],xmm24[3]
16155 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm19
16156 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm18
16157 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
16158 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload
16159 ; AVX512DQ-FCP-NEXT: # ymm12 = mem[0,1,1,3,4,5,5,7]
16160 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
16161 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload
16162 ; AVX512DQ-FCP-NEXT: # ymm9 = mem[0,1,1,3,4,5,5,7]
16163 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm9[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
16164 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,6],ymm2[7]
16165 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload
16166 ; AVX512DQ-FCP-NEXT: # ymm10 = mem[3,1,2,3,7,5,6,7]
16167 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm10[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
16168 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload
16169 ; AVX512DQ-FCP-NEXT: # ymm11 = mem[3,1,2,3,7,5,6,7]
16170 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
16171 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4],ymm3[5],ymm4[6,7]
16172 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
16173 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
16174 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0
16175 ; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
16176 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm0
16177 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm16 = xmm21[2],xmm13[2],xmm21[3],xmm13[3]
16178 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm13, %xmm5, %xmm0
16179 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm22
16180 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm24 # 16-byte Reload
16181 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm24[2],xmm25[2],xmm24[3],xmm25[3]
16182 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
16183 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm21
16184 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
16185 ; AVX512DQ-FCP-NEXT: # ymm14 = mem[0,1,1,3,4,5,5,7]
16186 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm0 = ymm14[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
16187 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload
16188 ; AVX512DQ-FCP-NEXT: # ymm15 = mem[0,1,1,3,4,5,5,7]
16189 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm15[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
16190 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5,6],ymm0[7]
16191 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Folded Reload
16192 ; AVX512DQ-FCP-NEXT: # ymm6 = mem[3,1,2,3,7,5,6,7]
16193 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm1 = ymm6[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
16194 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Folded Reload
16195 ; AVX512DQ-FCP-NEXT: # ymm4 = mem[3,1,2,3,7,5,6,7]
16196 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm4[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
16197 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4],ymm1[5],ymm3[6,7]
16198 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
16199 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 {%k1}
16200 ; AVX512DQ-FCP-NEXT: vmovdqa %xmm5, %xmm1
16201 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm7, %xmm25
16202 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm17, %xmm27
16203 ; AVX512DQ-FCP-NEXT: vpermi2d %xmm7, %xmm17, %xmm1
16204 ; AVX512DQ-FCP-NEXT: vmovdqa64 {{[-0-9]+}}(%r{{[sb]}}p), %xmm29 # 16-byte Reload
16205 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm29[2],xmm28[2],xmm29[3],xmm28[3]
16206 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm0[0,1],xmm1[2,3]
16207 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload
16208 ; AVX512DQ-FCP-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7]
16209 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5,4,6,8,9,10,11,12,13,12,14]
16210 ; AVX512DQ-FCP-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
16211 ; AVX512DQ-FCP-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
16212 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm3 = ymm7[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
16213 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7]
16214 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
16215 ; AVX512DQ-FCP-NEXT: # ymm5 = mem[3,1,2,3,7,5,6,7]
16216 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,1,2,0,4,5,6,7,8,9,10,8,12,13,14,15]
16217 ; AVX512DQ-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload
16218 ; AVX512DQ-FCP-NEXT: # ymm3 = mem[3,1,2,3,7,5,6,7]
16219 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm3[2,0,2,3,4,5,6,7,10,8,10,11,12,13,14,15]
16220 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3,4],ymm0[5],ymm13[6,7]
16221 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
16222 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
16223 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21
16224 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} xmm17 = [3,7,0,0]
16225 ; AVX512DQ-FCP-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
16226 ; AVX512DQ-FCP-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded Reload
16227 ; AVX512DQ-FCP-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
16228 ; AVX512DQ-FCP-NEXT: # xmm1 = xmm0[0,1],mem[2,3]
16229 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
16230 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm31, %ymm0
16231 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
16232 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm30, %ymm0
16233 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm13 = ymm0[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
16234 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm13[0,1,2,3,4,5,6],ymm2[7]
16235 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm26, %ymm0
16236 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm13 = ymm0[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
16237 ; AVX512DQ-FCP-NEXT: vmovdqa64 %ymm23, %ymm0
16238 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
16239 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm13[5],ymm0[6,7]
16240 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
16241 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm1 {%k1}
16242 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm19, %xmm0
16243 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm18, %xmm17, %xmm0
16244 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm20, %xmm2
16245 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
16246 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
16247 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
16248 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
16249 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm9 = ymm10[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
16250 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm10 = ymm11[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
16251 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5],ymm10[6,7]
16252 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm2[6,7]
16253 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
16254 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0
16255 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm1
16256 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm22, %xmm17, %xmm1
16257 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm16, %xmm2
16258 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
16259 ; AVX512DQ-FCP-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm1
16260 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm2 = ymm14[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
16261 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm9 = ymm15[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
16262 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5,6],ymm2[7]
16263 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm6 = ymm6[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
16264 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm4 = ymm4[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
16265 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5],ymm4[6,7]
16266 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5],ymm2[6,7]
16267 ; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 {%k1}
16268 ; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm29, %xmm4
16269 ; AVX512DQ-FCP-NEXT: vpermt2d %xmm28, %xmm17, %xmm4
16270 ; AVX512DQ-FCP-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm27[2],xmm25[2],xmm27[3],xmm25[3]
16271 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm4[0,1],xmm2[2,3]
16272 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15]
16273 ; AVX512DQ-FCP-NEXT: vpshufhw {{.*#+}} ymm6 = ymm7[0,1,2,3,5,7,6,7,8,9,10,11,13,15,14,15]
16274 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5,6],ymm4[7]
16275 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm5 = ymm5[0,1,3,1,4,5,6,7,8,9,11,9,12,13,14,15]
16276 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} ymm3 = ymm3[3,1,2,3,4,5,6,7,11,9,10,11,12,13,14,15]
16277 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm5[5],ymm3[6,7]
16278 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
16279 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
16280 ; AVX512DQ-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm1, %zmm1
16281 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16282 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rsi)
16283 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16284 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rsi)
16285 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16286 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rdx)
16287 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16288 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rdx)
16289 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16290 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rcx)
16291 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16292 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rcx)
16293 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16294 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%r8)
16295 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16296 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%r8)
16297 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16298 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%r9)
16299 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16300 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%r9)
16301 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16302 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16303 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, 64(%rax)
16304 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16305 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rax)
16306 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16307 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax)
16308 ; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
16309 ; AVX512DQ-FCP-NEXT: vmovaps %zmm2, (%rax)
16310 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16311 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax)
16312 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
16313 ; AVX512DQ-FCP-NEXT: addq $2312, %rsp # imm = 0x908
16314 ; AVX512DQ-FCP-NEXT: vzeroupper
16315 ; AVX512DQ-FCP-NEXT: retq
16316 ;
16317 ; AVX512BW-LABEL: load_i16_stride8_vf64:
16318 ; AVX512BW: # %bb.0:
16319 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
16320 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
16321 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
16322 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
16323 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm29
16324 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm1
16325 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm30
16326 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm31
16327 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm3
16328 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm7
16329 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm6
16330 ; AVX512BW-NEXT: vmovdqa64 576(%rdi), %zmm9
16331 ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm5
16332 ; AVX512BW-NEXT: vmovdqa64 704(%rdi), %zmm12
16333 ; AVX512BW-NEXT: vmovdqa64 640(%rdi), %zmm2
16334 ; AVX512BW-NEXT: vmovdqa64 832(%rdi), %zmm14
16335 ; AVX512BW-NEXT: vmovdqa64 768(%rdi), %zmm11
16336 ; AVX512BW-NEXT: vmovdqa64 960(%rdi), %zmm16
16337 ; AVX512BW-NEXT: vmovdqa64 896(%rdi), %zmm15
16338 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
16339 ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16340 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17
16341 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm13, %zmm17
16342 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm18
16343 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm18
16344 ; AVX512BW-NEXT: movb $-64, %dil
16345 ; AVX512BW-NEXT: kmovd %edi, %k1
16346 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
16347 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
16348 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm10
16349 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm8
16350 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm8
16351 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
16352 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
16353 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
16354 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
16355 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16356 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm13, %zmm10
16357 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16358 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
16359 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm8
16360 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm13
16361 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16362 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
16363 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
16364 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16365 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm10
16366 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10
16367 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm13
16368 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm13
16369 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16370 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
16371 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm10
16372 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm4
16373 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm4
16374 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16375 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
16376 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm4
16377 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm4
16378 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16379 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm8, %zmm10
16380 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
16381 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
16382 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm8, %zmm4
16383 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm8
16384 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
16385 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
16386 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
16387 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16388 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
16389 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16390 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
16391 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16392 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16393 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
16394 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16395 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
16396 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16397 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16398 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
16399 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
16400 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16401 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16402 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16403 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16404 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
16405 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16406 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16407 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16408 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
16409 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
16410 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16411 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
16412 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16413 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
16414 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16415 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16416 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
16417 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16418 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
16419 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16420 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16421 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
16422 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
16423 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16424 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16425 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16426 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16427 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
16428 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16429 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16430 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16431 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
16432 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
16433 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16434 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
16435 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16436 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
16437 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16438 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16439 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
16440 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16441 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
16442 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16443 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16444 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
16445 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
16446 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16447 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16448 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16449 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16450 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
16451 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16452 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16453 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16454 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
16455 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
16456 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16457 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
16458 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16459 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
16460 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16461 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16462 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
16463 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16464 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
16465 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16466 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16467 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
16468 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
16469 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16470 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10
16471 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16472 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16473 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
16474 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16475 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16476 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16477 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
16478 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
16479 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16480 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm8
16481 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16482 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10
16483 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16484 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16485 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm8
16486 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16487 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm13
16488 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16489 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16490 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
16491 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm10
16492 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm10
16493 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13
16494 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm13
16495 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16496 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
16497 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm10
16498 ; AVX512BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16499 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16500 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
16501 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
16502 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16503 ; AVX512BW-NEXT: vpermt2w %zmm16, %zmm10, %zmm15
16504 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm10, %zmm11
16505 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
16506 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm10, %zmm2
16507 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm5
16508 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
16509 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
16510 ; AVX512BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm6
16511 ; AVX512BW-NEXT: vpermt2w %zmm31, %zmm10, %zmm3
16512 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
16513 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm10, %zmm1
16514 ; AVX512BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm0
16515 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16516 ; AVX512BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
16517 ; AVX512BW-NEXT: vmovdqa64 %zmm28, 64(%rsi)
16518 ; AVX512BW-NEXT: vmovdqa64 %zmm17, (%rsi)
16519 ; AVX512BW-NEXT: vmovdqa64 %zmm18, 64(%rdx)
16520 ; AVX512BW-NEXT: vmovdqa64 %zmm19, (%rdx)
16521 ; AVX512BW-NEXT: vmovdqa64 %zmm20, 64(%rcx)
16522 ; AVX512BW-NEXT: vmovdqa64 %zmm21, (%rcx)
16523 ; AVX512BW-NEXT: vmovdqa64 %zmm22, 64(%r8)
16524 ; AVX512BW-NEXT: vmovdqa64 %zmm23, (%r8)
16525 ; AVX512BW-NEXT: vmovdqa64 %zmm24, 64(%r9)
16526 ; AVX512BW-NEXT: vmovdqa64 %zmm25, (%r9)
16527 ; AVX512BW-NEXT: vmovdqa64 %zmm26, 64(%r11)
16528 ; AVX512BW-NEXT: vmovdqa64 %zmm27, (%r11)
16529 ; AVX512BW-NEXT: vmovdqa64 %zmm8, 64(%r10)
16530 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%r10)
16531 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
16532 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax)
16533 ; AVX512BW-NEXT: vzeroupper
16534 ; AVX512BW-NEXT: retq
16535 ;
16536 ; AVX512BW-FCP-LABEL: load_i16_stride8_vf64:
16537 ; AVX512BW-FCP: # %bb.0:
16538 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16539 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
16540 ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
16541 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
16542 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
16543 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
16544 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
16545 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31
16546 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
16547 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
16548 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
16549 ; AVX512BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9
16550 ; AVX512BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
16551 ; AVX512BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
16552 ; AVX512BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2
16553 ; AVX512BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14
16554 ; AVX512BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
16555 ; AVX512BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16
16556 ; AVX512BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15
16557 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
16558 ; AVX512BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16559 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
16560 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm13, %zmm17
16561 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18
16562 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm18
16563 ; AVX512BW-FCP-NEXT: movb $-64, %dil
16564 ; AVX512BW-FCP-NEXT: kmovd %edi, %k1
16565 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
16566 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
16567 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm13, %zmm10
16568 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
16569 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm13, %zmm8
16570 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
16571 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
16572 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
16573 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
16574 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16575 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm13, %zmm10
16576 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16577 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
16578 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm13, %zmm8
16579 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm13
16580 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16581 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
16582 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
16583 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16584 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
16585 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm8, %zmm10
16586 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13
16587 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm8, %zmm13
16588 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16589 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
16590 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm10
16591 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
16592 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm8, %zmm4
16593 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16594 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
16595 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
16596 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm4
16597 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16598 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm8, %zmm10
16599 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
16600 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
16601 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm8, %zmm4
16602 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm8
16603 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
16604 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
16605 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
16606 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16607 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
16608 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16609 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
16610 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16611 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16612 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16613 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16614 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
16615 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16616 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16617 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
16618 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
16619 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16620 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16621 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16622 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16623 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
16624 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16625 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16626 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16627 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
16628 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
16629 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16630 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
16631 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16632 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
16633 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16634 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16635 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16636 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16637 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
16638 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16639 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16640 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
16641 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
16642 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16643 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16644 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16645 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16646 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
16647 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16648 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16649 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16650 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
16651 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
16652 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16653 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
16654 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16655 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
16656 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16657 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16658 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16659 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16660 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
16661 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16662 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16663 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
16664 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
16665 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16666 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16667 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16668 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16669 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
16670 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16671 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16672 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16673 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
16674 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
16675 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16676 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
16677 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16678 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
16679 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16680 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16681 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16682 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16683 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
16684 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16685 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16686 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
16687 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
16688 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16689 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
16690 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16691 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16692 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
16693 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16694 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16695 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16696 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
16697 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
16698 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16699 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
16700 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16701 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
16702 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16703 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16704 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
16705 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16706 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
16707 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16708 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16709 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
16710 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
16711 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm10
16712 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
16713 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm13
16714 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16715 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
16716 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm10
16717 ; AVX512BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16718 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16719 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
16720 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
16721 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16722 ; AVX512BW-FCP-NEXT: vpermt2w %zmm16, %zmm10, %zmm15
16723 ; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm10, %zmm11
16724 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
16725 ; AVX512BW-FCP-NEXT: vpermt2w %zmm12, %zmm10, %zmm2
16726 ; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm5
16727 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
16728 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
16729 ; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm6
16730 ; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm10, %zmm3
16731 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
16732 ; AVX512BW-FCP-NEXT: vpermt2w %zmm30, %zmm10, %zmm1
16733 ; AVX512BW-FCP-NEXT: vpermt2w %zmm29, %zmm10, %zmm0
16734 ; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16735 ; AVX512BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
16736 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi)
16737 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
16738 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx)
16739 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
16740 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx)
16741 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
16742 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
16743 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
16744 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9)
16745 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9)
16746 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11)
16747 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11)
16748 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10)
16749 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
16750 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
16751 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
16752 ; AVX512BW-FCP-NEXT: vzeroupper
16753 ; AVX512BW-FCP-NEXT: retq
16754 ;
16755 ; AVX512DQ-BW-LABEL: load_i16_stride8_vf64:
16756 ; AVX512DQ-BW: # %bb.0:
16757 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax
16758 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10
16759 ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r11
16760 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
16761 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm29
16762 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm1
16763 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm30
16764 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm31
16765 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm3
16766 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm7
16767 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm6
16768 ; AVX512DQ-BW-NEXT: vmovdqa64 576(%rdi), %zmm9
16769 ; AVX512DQ-BW-NEXT: vmovdqa64 512(%rdi), %zmm5
16770 ; AVX512DQ-BW-NEXT: vmovdqa64 704(%rdi), %zmm12
16771 ; AVX512DQ-BW-NEXT: vmovdqa64 640(%rdi), %zmm2
16772 ; AVX512DQ-BW-NEXT: vmovdqa64 832(%rdi), %zmm14
16773 ; AVX512DQ-BW-NEXT: vmovdqa64 768(%rdi), %zmm11
16774 ; AVX512DQ-BW-NEXT: vmovdqa64 960(%rdi), %zmm16
16775 ; AVX512DQ-BW-NEXT: vmovdqa64 896(%rdi), %zmm15
16776 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
16777 ; AVX512DQ-BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16778 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm17
16779 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm13, %zmm17
16780 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm18
16781 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm18
16782 ; AVX512DQ-BW-NEXT: movb $-64, %dil
16783 ; AVX512DQ-BW-NEXT: kmovd %edi, %k1
16784 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
16785 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10
16786 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm13, %zmm10
16787 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm8
16788 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm13, %zmm8
16789 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
16790 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
16791 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
16792 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
16793 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16794 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm13, %zmm10
16795 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16796 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
16797 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm8
16798 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm13
16799 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16800 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
16801 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
16802 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16803 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm10
16804 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm8, %zmm10
16805 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm13
16806 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm13
16807 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16808 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10
16809 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm10
16810 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm4
16811 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm8, %zmm4
16812 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16813 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
16814 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm4
16815 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm4
16816 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16817 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm8, %zmm10
16818 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
16819 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm4
16820 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm8, %zmm4
16821 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm8
16822 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
16823 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
16824 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
16825 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16826 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
16827 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16828 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
16829 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16830 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16831 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
16832 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16833 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
16834 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16835 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16836 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
16837 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
16838 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16839 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16840 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16841 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16842 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
16843 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16844 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16845 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16846 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
16847 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
16848 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16849 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
16850 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16851 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
16852 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16853 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16854 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
16855 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16856 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
16857 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16858 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16859 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
16860 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
16861 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16862 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16863 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16864 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16865 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
16866 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16867 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16868 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16869 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
16870 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
16871 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16872 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
16873 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16874 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
16875 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16876 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16877 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
16878 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16879 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
16880 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16881 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16882 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
16883 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
16884 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16885 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16886 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16887 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16888 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
16889 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16890 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16891 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16892 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
16893 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
16894 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16895 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
16896 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16897 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
16898 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16899 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16900 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
16901 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16902 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
16903 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16904 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16905 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
16906 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm8
16907 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
16908 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm10
16909 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
16910 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16911 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm8
16912 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
16913 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16914 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
16915 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
16916 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
16917 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16918 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm8
16919 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
16920 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, %zmm10
16921 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
16922 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
16923 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm8
16924 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
16925 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm13
16926 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
16927 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
16928 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
16929 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm10
16930 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm4, %zmm10
16931 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, %zmm13
16932 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm4, %zmm13
16933 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
16934 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm1, %zmm10
16935 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm4, %zmm10
16936 ; AVX512DQ-BW-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
16937 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
16938 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
16939 ; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
16940 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16941 ; AVX512DQ-BW-NEXT: vpermt2w %zmm16, %zmm10, %zmm15
16942 ; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm10, %zmm11
16943 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
16944 ; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm10, %zmm2
16945 ; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm10, %zmm5
16946 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
16947 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
16948 ; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm10, %zmm6
16949 ; AVX512DQ-BW-NEXT: vpermt2w %zmm31, %zmm10, %zmm3
16950 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
16951 ; AVX512DQ-BW-NEXT: vpermt2w %zmm30, %zmm10, %zmm1
16952 ; AVX512DQ-BW-NEXT: vpermt2w %zmm29, %zmm10, %zmm0
16953 ; AVX512DQ-BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
16954 ; AVX512DQ-BW-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
16955 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm28, 64(%rsi)
16956 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, (%rsi)
16957 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 64(%rdx)
16958 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm19, (%rdx)
16959 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 64(%rcx)
16960 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm21, (%rcx)
16961 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm22, 64(%r8)
16962 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, (%r8)
16963 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm24, 64(%r9)
16964 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm25, (%r9)
16965 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm26, 64(%r11)
16966 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm27, (%r11)
16967 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, 64(%r10)
16968 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%r10)
16969 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax)
16970 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax)
16971 ; AVX512DQ-BW-NEXT: vzeroupper
16972 ; AVX512DQ-BW-NEXT: retq
16973 ;
16974 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride8_vf64:
16975 ; AVX512DQ-BW-FCP: # %bb.0:
16976 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax
16977 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10
16978 ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r11
16979 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
16980 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm29
16981 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
16982 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm30
16983 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm31
16984 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm3
16985 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm7
16986 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm6
16987 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 576(%rdi), %zmm9
16988 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 512(%rdi), %zmm5
16989 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 704(%rdi), %zmm12
16990 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 640(%rdi), %zmm2
16991 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 832(%rdi), %zmm14
16992 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 768(%rdi), %zmm11
16993 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 960(%rdi), %zmm16
16994 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 896(%rdi), %zmm15
16995 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm13 = [0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56,0,8,16,24,32,40,48,56]
16996 ; AVX512DQ-BW-FCP-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
16997 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm17
16998 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm13, %zmm17
16999 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm18
17000 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm13, %zmm18
17001 ; AVX512DQ-BW-FCP-NEXT: movb $-64, %dil
17002 ; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1
17003 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, %zmm18 {%k1}
17004 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
17005 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm13, %zmm10
17006 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm8
17007 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm13, %zmm8
17008 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3],ymm10[4,5,6,7]
17009 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm18, %zmm28
17010 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
17011 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm13, %zmm8
17012 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17013 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm13, %zmm10
17014 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17015 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
17016 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm13, %zmm8
17017 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm13
17018 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17019 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm17
17020 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57,1,9,17,25,33,41,49,57]
17021 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17022 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm10
17023 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm8, %zmm10
17024 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm13
17025 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm8, %zmm13
17026 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
17027 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
17028 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm8, %zmm10
17029 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm4
17030 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm8, %zmm4
17031 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
17032 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm18
17033 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm4
17034 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm8, %zmm4
17035 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17036 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm8, %zmm10
17037 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, %zmm10 {%k1}
17038 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm4
17039 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm8, %zmm4
17040 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm8
17041 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
17042 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm19
17043 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58,2,10,18,26,34,42,50,58]
17044 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17045 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
17046 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
17047 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
17048 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
17049 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17050 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17051 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
17052 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
17053 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
17054 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17055 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm20
17056 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
17057 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
17058 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17059 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
17060 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17061 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
17062 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
17063 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
17064 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
17065 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm21
17066 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59,3,11,19,27,35,43,51,59]
17067 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17068 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
17069 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
17070 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
17071 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
17072 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17073 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17074 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
17075 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
17076 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
17077 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17078 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm22
17079 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
17080 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
17081 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17082 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
17083 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17084 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
17085 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
17086 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
17087 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
17088 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm23
17089 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60,4,12,20,28,36,44,52,60]
17090 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17091 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
17092 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
17093 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
17094 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
17095 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17096 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17097 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
17098 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
17099 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
17100 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17101 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm24
17102 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
17103 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
17104 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17105 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
17106 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17107 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
17108 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
17109 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
17110 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
17111 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm25
17112 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61,5,13,21,29,37,45,53,61]
17113 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17114 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
17115 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
17116 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
17117 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
17118 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17119 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17120 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
17121 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
17122 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
17123 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17124 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm26
17125 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm8
17126 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm8
17127 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm10
17128 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm10
17129 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17130 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm8
17131 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm8
17132 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
17133 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm8[4,5,6,7]
17134 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm10, %zmm27
17135 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62,6,14,22,30,38,46,54,62]
17136 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17137 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm8
17138 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm4, %zmm8
17139 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, %zmm10
17140 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm4, %zmm10
17141 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, %zmm10 {%k1}
17142 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm8
17143 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm4, %zmm8
17144 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm13
17145 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm4, %zmm13
17146 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0,1,2,3],ymm8[4,5,6,7]
17147 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm8, %zmm10, %zmm8
17148 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm10
17149 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm4, %zmm10
17150 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, %zmm13
17151 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm4, %zmm13
17152 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, %zmm13 {%k1}
17153 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, %zmm10
17154 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm4, %zmm10
17155 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm29, %zmm0, %zmm4
17156 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm10[4,5,6,7]
17157 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm4, %zmm13, %zmm4
17158 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63,7,15,23,31,39,47,55,63]
17159 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
17160 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm16, %zmm10, %zmm15
17161 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm10, %zmm11
17162 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, %zmm11 {%k1}
17163 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm10, %zmm2
17164 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm10, %zmm5
17165 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
17166 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm2, %zmm11, %zmm2
17167 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm10, %zmm6
17168 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm10, %zmm3
17169 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, %zmm3 {%k1}
17170 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm30, %zmm10, %zmm1
17171 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm29, %zmm10, %zmm0
17172 ; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
17173 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0
17174 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm28, 64(%rsi)
17175 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rsi)
17176 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 64(%rdx)
17177 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, (%rdx)
17178 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 64(%rcx)
17179 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, (%rcx)
17180 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 64(%r8)
17181 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm23, (%r8)
17182 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm24, 64(%r9)
17183 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm25, (%r9)
17184 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm26, 64(%r11)
17185 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm27, (%r11)
17186 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, 64(%r10)
17187 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%r10)
17188 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax)
17189 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax)
17190 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
17191 ; AVX512DQ-BW-FCP-NEXT: retq
  %wide.vec = load <512 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56, i32 64, i32 72, i32 80, i32 88, i32 96, i32 104, i32 112, i32 120, i32 128, i32 136, i32 144, i32 152, i32 160, i32 168, i32 176, i32 184, i32 192, i32 200, i32 208, i32 216, i32 224, i32 232, i32 240, i32 248, i32 256, i32 264, i32 272, i32 280, i32 288, i32 296, i32 304, i32 312, i32 320, i32 328, i32 336, i32 344, i32 352, i32 360, i32 368, i32 376, i32 384, i32 392, i32 400, i32 408, i32 416, i32 424, i32 432, i32 440, i32 448, i32 456, i32 464, i32 472, i32 480, i32 488, i32 496, i32 504>
  %strided.vec1 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 1, i32 9, i32 17, i32 25, i32 33, i32 41, i32 49, i32 57, i32 65, i32 73, i32 81, i32 89, i32 97, i32 105, i32 113, i32 121, i32 129, i32 137, i32 145, i32 153, i32 161, i32 169, i32 177, i32 185, i32 193, i32 201, i32 209, i32 217, i32 225, i32 233, i32 241, i32 249, i32 257, i32 265, i32 273, i32 281, i32 289, i32 297, i32 305, i32 313, i32 321, i32 329, i32 337, i32 345, i32 353, i32 361, i32 369, i32 377, i32 385, i32 393, i32 401, i32 409, i32 417, i32 425, i32 433, i32 441, i32 449, i32 457, i32 465, i32 473, i32 481, i32 489, i32 497, i32 505>
  %strided.vec2 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 2, i32 10, i32 18, i32 26, i32 34, i32 42, i32 50, i32 58, i32 66, i32 74, i32 82, i32 90, i32 98, i32 106, i32 114, i32 122, i32 130, i32 138, i32 146, i32 154, i32 162, i32 170, i32 178, i32 186, i32 194, i32 202, i32 210, i32 218, i32 226, i32 234, i32 242, i32 250, i32 258, i32 266, i32 274, i32 282, i32 290, i32 298, i32 306, i32 314, i32 322, i32 330, i32 338, i32 346, i32 354, i32 362, i32 370, i32 378, i32 386, i32 394, i32 402, i32 410, i32 418, i32 426, i32 434, i32 442, i32 450, i32 458, i32 466, i32 474, i32 482, i32 490, i32 498, i32 506>
  %strided.vec3 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 3, i32 11, i32 19, i32 27, i32 35, i32 43, i32 51, i32 59, i32 67, i32 75, i32 83, i32 91, i32 99, i32 107, i32 115, i32 123, i32 131, i32 139, i32 147, i32 155, i32 163, i32 171, i32 179, i32 187, i32 195, i32 203, i32 211, i32 219, i32 227, i32 235, i32 243, i32 251, i32 259, i32 267, i32 275, i32 283, i32 291, i32 299, i32 307, i32 315, i32 323, i32 331, i32 339, i32 347, i32 355, i32 363, i32 371, i32 379, i32 387, i32 395, i32 403, i32 411, i32 419, i32 427, i32 435, i32 443, i32 451, i32 459, i32 467, i32 475, i32 483, i32 491, i32 499, i32 507>
  %strided.vec4 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 4, i32 12, i32 20, i32 28, i32 36, i32 44, i32 52, i32 60, i32 68, i32 76, i32 84, i32 92, i32 100, i32 108, i32 116, i32 124, i32 132, i32 140, i32 148, i32 156, i32 164, i32 172, i32 180, i32 188, i32 196, i32 204, i32 212, i32 220, i32 228, i32 236, i32 244, i32 252, i32 260, i32 268, i32 276, i32 284, i32 292, i32 300, i32 308, i32 316, i32 324, i32 332, i32 340, i32 348, i32 356, i32 364, i32 372, i32 380, i32 388, i32 396, i32 404, i32 412, i32 420, i32 428, i32 436, i32 444, i32 452, i32 460, i32 468, i32 476, i32 484, i32 492, i32 500, i32 508>
  %strided.vec5 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 5, i32 13, i32 21, i32 29, i32 37, i32 45, i32 53, i32 61, i32 69, i32 77, i32 85, i32 93, i32 101, i32 109, i32 117, i32 125, i32 133, i32 141, i32 149, i32 157, i32 165, i32 173, i32 181, i32 189, i32 197, i32 205, i32 213, i32 221, i32 229, i32 237, i32 245, i32 253, i32 261, i32 269, i32 277, i32 285, i32 293, i32 301, i32 309, i32 317, i32 325, i32 333, i32 341, i32 349, i32 357, i32 365, i32 373, i32 381, i32 389, i32 397, i32 405, i32 413, i32 421, i32 429, i32 437, i32 445, i32 453, i32 461, i32 469, i32 477, i32 485, i32 493, i32 501, i32 509>
  %strided.vec6 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 6, i32 14, i32 22, i32 30, i32 38, i32 46, i32 54, i32 62, i32 70, i32 78, i32 86, i32 94, i32 102, i32 110, i32 118, i32 126, i32 134, i32 142, i32 150, i32 158, i32 166, i32 174, i32 182, i32 190, i32 198, i32 206, i32 214, i32 222, i32 230, i32 238, i32 246, i32 254, i32 262, i32 270, i32 278, i32 286, i32 294, i32 302, i32 310, i32 318, i32 326, i32 334, i32 342, i32 350, i32 358, i32 366, i32 374, i32 382, i32 390, i32 398, i32 406, i32 414, i32 422, i32 430, i32 438, i32 446, i32 454, i32 462, i32 470, i32 478, i32 486, i32 494, i32 502, i32 510>
  %strided.vec7 = shufflevector <512 x i16> %wide.vec, <512 x i16> poison, <64 x i32> <i32 7, i32 15, i32 23, i32 31, i32 39, i32 47, i32 55, i32 63, i32 71, i32 79, i32 87, i32 95, i32 103, i32 111, i32 119, i32 127, i32 135, i32 143, i32 151, i32 159, i32 167, i32 175, i32 183, i32 191, i32 199, i32 207, i32 215, i32 223, i32 231, i32 239, i32 247, i32 255, i32 263, i32 271, i32 279, i32 287, i32 295, i32 303, i32 311, i32 319, i32 327, i32 335, i32 343, i32 351, i32 359, i32 367, i32 375, i32 383, i32 391, i32 399, i32 407, i32 415, i32 423, i32 431, i32 439, i32 447, i32 455, i32 463, i32 471, i32 479, i32 487, i32 495, i32 503, i32 511>
  store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i16> %strided.vec3, ptr %out.vec3, align 64
  store <64 x i16> %strided.vec4, ptr %out.vec4, align 64
  store <64 x i16> %strided.vec5, ptr %out.vec5, align 64
  store <64 x i16> %strided.vec6, ptr %out.vec6, align 64
  store <64 x i16> %strided.vec7, ptr %out.vec7, align 64