1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
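; For reference, a minimal sketch (illustrative only; not one of the checked
; functions, and the value names here are made up) of the stride-4
; interleaved-load IR shape this file exercises: one wide load feeding one
; shufflevector per lane, exactly as in the function bodies below.
;
;   %wide = load <8 x i16>, ptr %in, align 64
;   %lane0 = shufflevector <8 x i16> %wide, <8 x i16> poison, <2 x i32> <i32 0, i32 4>
;   %lane1 = shufflevector <8 x i16> %wide, <8 x i16> poison, <2 x i32> <i32 1, i32 5>
;   %lane2 = shufflevector <8 x i16> %wide, <8 x i16> poison, <2 x i32> <i32 2, i32 6>
;   %lane3 = shufflevector <8 x i16> %wide, <8 x i16> poison, <2 x i32> <i32 3, i32 7>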
18 define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
19 ; SSE-LABEL: load_i16_stride4_vf2:
20 ; SSE: # %bb.0:
21 ; SSE-NEXT: movdqa (%rdi), %xmm0
22 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
23 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
24 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
25 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
26 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
27 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
28 ; SSE-NEXT: movd %xmm2, (%rsi)
29 ; SSE-NEXT: movd %xmm1, (%rdx)
30 ; SSE-NEXT: movd %xmm3, (%rcx)
31 ; SSE-NEXT: movd %xmm0, (%r8)
32 ; SSE-NEXT: retq
34 ; AVX-LABEL: load_i16_stride4_vf2:
35 ; AVX: # %bb.0:
36 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
37 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
38 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
39 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
40 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
41 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
42 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
43 ; AVX-NEXT: vmovd %xmm2, (%rsi)
44 ; AVX-NEXT: vmovd %xmm1, (%rdx)
45 ; AVX-NEXT: vmovd %xmm3, (%rcx)
46 ; AVX-NEXT: vmovd %xmm0, (%r8)
47 ; AVX-NEXT: retq
49 ; AVX2-LABEL: load_i16_stride4_vf2:
50 ; AVX2: # %bb.0:
51 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
52 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
53 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
54 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
55 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
56 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
57 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
58 ; AVX2-NEXT: vmovd %xmm2, (%rsi)
59 ; AVX2-NEXT: vmovd %xmm1, (%rdx)
60 ; AVX2-NEXT: vmovd %xmm3, (%rcx)
61 ; AVX2-NEXT: vmovd %xmm0, (%r8)
62 ; AVX2-NEXT: retq
64 ; AVX2-FP-LABEL: load_i16_stride4_vf2:
65 ; AVX2-FP: # %bb.0:
66 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
67 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
68 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
69 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
70 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
71 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
72 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
73 ; AVX2-FP-NEXT: vmovd %xmm2, (%rsi)
74 ; AVX2-FP-NEXT: vmovd %xmm1, (%rdx)
75 ; AVX2-FP-NEXT: vmovd %xmm3, (%rcx)
76 ; AVX2-FP-NEXT: vmovd %xmm0, (%r8)
77 ; AVX2-FP-NEXT: retq
79 ; AVX2-FCP-LABEL: load_i16_stride4_vf2:
80 ; AVX2-FCP: # %bb.0:
81 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
82 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
83 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
84 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
85 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
86 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
87 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
88 ; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi)
89 ; AVX2-FCP-NEXT: vmovd %xmm1, (%rdx)
90 ; AVX2-FCP-NEXT: vmovd %xmm3, (%rcx)
91 ; AVX2-FCP-NEXT: vmovd %xmm0, (%r8)
92 ; AVX2-FCP-NEXT: retq
94 ; AVX512-LABEL: load_i16_stride4_vf2:
95 ; AVX512: # %bb.0:
96 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
97 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
98 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
99 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
100 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
101 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
102 ; AVX512-NEXT: vpmovqw %xmm0, (%rsi)
103 ; AVX512-NEXT: vmovd %xmm1, (%rdx)
104 ; AVX512-NEXT: vmovd %xmm3, (%rcx)
105 ; AVX512-NEXT: vmovd %xmm2, (%r8)
106 ; AVX512-NEXT: retq
108 ; AVX512-FCP-LABEL: load_i16_stride4_vf2:
109 ; AVX512-FCP: # %bb.0:
110 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
111 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
112 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
113 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
114 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
115 ; AVX512-FCP-NEXT: vpmovqw %xmm0, (%rsi)
116 ; AVX512-FCP-NEXT: vmovd %xmm1, (%rdx)
117 ; AVX512-FCP-NEXT: vmovd %xmm3, (%rcx)
118 ; AVX512-FCP-NEXT: vmovd %xmm2, (%r8)
119 ; AVX512-FCP-NEXT: retq
121 ; AVX512DQ-LABEL: load_i16_stride4_vf2:
122 ; AVX512DQ: # %bb.0:
123 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
124 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
125 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
126 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
127 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
128 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
129 ; AVX512DQ-NEXT: vpmovqw %xmm0, (%rsi)
130 ; AVX512DQ-NEXT: vmovd %xmm1, (%rdx)
131 ; AVX512DQ-NEXT: vmovd %xmm3, (%rcx)
132 ; AVX512DQ-NEXT: vmovd %xmm2, (%r8)
133 ; AVX512DQ-NEXT: retq
135 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf2:
136 ; AVX512DQ-FCP: # %bb.0:
137 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
138 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
139 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
140 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
141 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
142 ; AVX512DQ-FCP-NEXT: vpmovqw %xmm0, (%rsi)
143 ; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%rdx)
144 ; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rcx)
145 ; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%r8)
146 ; AVX512DQ-FCP-NEXT: retq
148 ; AVX512BW-LABEL: load_i16_stride4_vf2:
149 ; AVX512BW: # %bb.0:
150 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
151 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
152 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
153 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
154 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
155 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
156 ; AVX512BW-NEXT: vpmovqw %xmm0, (%rsi)
157 ; AVX512BW-NEXT: vmovd %xmm1, (%rdx)
158 ; AVX512BW-NEXT: vmovd %xmm3, (%rcx)
159 ; AVX512BW-NEXT: vmovd %xmm2, (%r8)
160 ; AVX512BW-NEXT: retq
162 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf2:
163 ; AVX512BW-FCP: # %bb.0:
164 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
165 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
166 ; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
167 ; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
168 ; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
169 ; AVX512BW-FCP-NEXT: vpmovqw %xmm0, (%rsi)
170 ; AVX512BW-FCP-NEXT: vmovd %xmm1, (%rdx)
171 ; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rcx)
172 ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%r8)
173 ; AVX512BW-FCP-NEXT: retq
175 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf2:
176 ; AVX512DQ-BW: # %bb.0:
177 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
178 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
179 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
180 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
181 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
182 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
183 ; AVX512DQ-BW-NEXT: vpmovqw %xmm0, (%rsi)
184 ; AVX512DQ-BW-NEXT: vmovd %xmm1, (%rdx)
185 ; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rcx)
186 ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%r8)
187 ; AVX512DQ-BW-NEXT: retq
189 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf2:
190 ; AVX512DQ-BW-FCP: # %bb.0:
191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
192 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
193 ; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
194 ; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
195 ; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
196 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %xmm0, (%rsi)
197 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%rdx)
198 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rcx)
199 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%r8)
200 ; AVX512DQ-BW-FCP-NEXT: retq
201 %wide.vec = load <8 x i16>, ptr %in.vec, align 64
202 %strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> <i32 0, i32 4>
203 %strided.vec1 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> <i32 1, i32 5>
204 %strided.vec2 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> <i32 2, i32 6>
205 %strided.vec3 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> <i32 3, i32 7>
206 store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
207 store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
208 store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
209 store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
210 ret void
211 }
213 define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
214 ; SSE-LABEL: load_i16_stride4_vf4:
215 ; SSE: # %bb.0:
216 ; SSE-NEXT: movdqa (%rdi), %xmm0
217 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
218 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
219 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
220 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
221 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7]
222 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
223 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
224 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7]
225 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
226 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
227 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
228 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
229 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,0,2,3,4,5,6,7]
230 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
231 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
232 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
233 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
234 ; SSE-NEXT: movq %xmm5, (%rsi)
235 ; SSE-NEXT: movq %xmm3, (%rdx)
236 ; SSE-NEXT: movq %xmm4, (%rcx)
237 ; SSE-NEXT: movq %xmm0, (%r8)
238 ; SSE-NEXT: retq
240 ; AVX-LABEL: load_i16_stride4_vf4:
241 ; AVX: # %bb.0:
242 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
243 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
244 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
245 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
246 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
247 ; AVX-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
248 ; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
249 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
250 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
251 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
252 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
253 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
254 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
255 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
256 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
257 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
258 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
259 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
260 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
261 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
262 ; AVX-NEXT: vmovq %xmm0, (%rsi)
263 ; AVX-NEXT: vmovq %xmm3, (%rdx)
264 ; AVX-NEXT: vmovq %xmm4, (%rcx)
265 ; AVX-NEXT: vmovq %xmm1, (%r8)
266 ; AVX-NEXT: retq
268 ; AVX2-LABEL: load_i16_stride4_vf4:
269 ; AVX2: # %bb.0:
270 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
271 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
272 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
273 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
274 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
275 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
276 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
277 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
278 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
279 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
280 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
281 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
282 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
283 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
284 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
285 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
286 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
287 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
288 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
289 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
290 ; AVX2-NEXT: vmovq %xmm0, (%rsi)
291 ; AVX2-NEXT: vmovq %xmm3, (%rdx)
292 ; AVX2-NEXT: vmovq %xmm4, (%rcx)
293 ; AVX2-NEXT: vmovq %xmm1, (%r8)
294 ; AVX2-NEXT: retq
296 ; AVX2-FP-LABEL: load_i16_stride4_vf4:
297 ; AVX2-FP: # %bb.0:
298 ; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0
299 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
300 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2
301 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
302 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
303 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
304 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
305 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
306 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
307 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
308 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
309 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
310 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
311 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
312 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
313 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
314 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
315 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
316 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
317 ; AVX2-FP-NEXT: vmovq %xmm0, (%rsi)
318 ; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
319 ; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
320 ; AVX2-FP-NEXT: vmovq %xmm1, (%r8)
321 ; AVX2-FP-NEXT: retq
323 ; AVX2-FCP-LABEL: load_i16_stride4_vf4:
324 ; AVX2-FCP: # %bb.0:
325 ; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0
326 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
327 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
328 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
329 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
330 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
331 ; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
332 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
333 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
334 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
335 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
336 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
337 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
338 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
339 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
340 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
341 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
342 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
343 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
344 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi)
345 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
346 ; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx)
347 ; AVX2-FCP-NEXT: vmovq %xmm1, (%r8)
348 ; AVX2-FCP-NEXT: retq
350 ; AVX512-LABEL: load_i16_stride4_vf4:
351 ; AVX512: # %bb.0:
352 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
353 ; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1
354 ; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm2
355 ; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm3
356 ; AVX512-NEXT: vpmovqw %ymm0, (%rsi)
357 ; AVX512-NEXT: vpmovqw %ymm1, (%rdx)
358 ; AVX512-NEXT: vpmovqw %ymm2, (%rcx)
359 ; AVX512-NEXT: vpmovqw %ymm3, (%r8)
360 ; AVX512-NEXT: vzeroupper
361 ; AVX512-NEXT: retq
363 ; AVX512-FCP-LABEL: load_i16_stride4_vf4:
364 ; AVX512-FCP: # %bb.0:
365 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
366 ; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
367 ; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
368 ; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
369 ; AVX512-FCP-NEXT: vpmovqw %ymm0, (%rsi)
370 ; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rdx)
371 ; AVX512-FCP-NEXT: vpmovqw %ymm2, (%rcx)
372 ; AVX512-FCP-NEXT: vpmovqw %ymm3, (%r8)
373 ; AVX512-FCP-NEXT: vzeroupper
374 ; AVX512-FCP-NEXT: retq
376 ; AVX512DQ-LABEL: load_i16_stride4_vf4:
377 ; AVX512DQ: # %bb.0:
378 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
379 ; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1
380 ; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm2
381 ; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm3
382 ; AVX512DQ-NEXT: vpmovqw %ymm0, (%rsi)
383 ; AVX512DQ-NEXT: vpmovqw %ymm1, (%rdx)
384 ; AVX512DQ-NEXT: vpmovqw %ymm2, (%rcx)
385 ; AVX512DQ-NEXT: vpmovqw %ymm3, (%r8)
386 ; AVX512DQ-NEXT: vzeroupper
387 ; AVX512DQ-NEXT: retq
389 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf4:
390 ; AVX512DQ-FCP: # %bb.0:
391 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
392 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
393 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
394 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
395 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%rsi)
396 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rdx)
397 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm2, (%rcx)
398 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm3, (%r8)
399 ; AVX512DQ-FCP-NEXT: vzeroupper
400 ; AVX512DQ-FCP-NEXT: retq
402 ; AVX512BW-LABEL: load_i16_stride4_vf4:
403 ; AVX512BW: # %bb.0:
404 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
405 ; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1
406 ; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
407 ; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm3
408 ; AVX512BW-NEXT: vpmovqw %ymm0, (%rsi)
409 ; AVX512BW-NEXT: vpmovqw %ymm1, (%rdx)
410 ; AVX512BW-NEXT: vpmovqw %ymm2, (%rcx)
411 ; AVX512BW-NEXT: vpmovqw %ymm3, (%r8)
412 ; AVX512BW-NEXT: vzeroupper
413 ; AVX512BW-NEXT: retq
415 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf4:
416 ; AVX512BW-FCP: # %bb.0:
417 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
418 ; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
419 ; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
420 ; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
421 ; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%rsi)
422 ; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rdx)
423 ; AVX512BW-FCP-NEXT: vpmovqw %ymm2, (%rcx)
424 ; AVX512BW-FCP-NEXT: vpmovqw %ymm3, (%r8)
425 ; AVX512BW-FCP-NEXT: vzeroupper
426 ; AVX512BW-FCP-NEXT: retq
428 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf4:
429 ; AVX512DQ-BW: # %bb.0:
430 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
431 ; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1
432 ; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm2
433 ; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm3
434 ; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%rsi)
435 ; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rdx)
436 ; AVX512DQ-BW-NEXT: vpmovqw %ymm2, (%rcx)
437 ; AVX512DQ-BW-NEXT: vpmovqw %ymm3, (%r8)
438 ; AVX512DQ-BW-NEXT: vzeroupper
439 ; AVX512DQ-BW-NEXT: retq
441 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf4:
442 ; AVX512DQ-BW-FCP: # %bb.0:
443 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
444 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
445 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
446 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
447 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%rsi)
448 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rdx)
449 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm2, (%rcx)
450 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm3, (%r8)
451 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
452 ; AVX512DQ-BW-FCP-NEXT: retq
453 %wide.vec = load <16 x i16>, ptr %in.vec, align 64
454 %strided.vec0 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
455 %strided.vec1 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
456 %strided.vec2 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
457 %strided.vec3 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
458 store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
459 store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
460 store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
461 store <4 x i16> %strided.vec3, ptr %out.vec3, align 64
462 ret void
463 }
465 define void @load_i16_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
466 ; SSE-LABEL: load_i16_stride4_vf8:
467 ; SSE: # %bb.0:
468 ; SSE-NEXT: movdqa (%rdi), %xmm2
469 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
470 ; SSE-NEXT: movdqa 32(%rdi), %xmm1
471 ; SSE-NEXT: movdqa 48(%rdi), %xmm4
472 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3]
473 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7]
474 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
475 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,2,2,3,4,5,6,7]
476 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
477 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3]
478 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,0,2,4,5,6,7]
479 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
480 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7]
481 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
482 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
483 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
484 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
485 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
486 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,3,4,5,6,7]
487 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,1,1,3,4,5,6,7]
488 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
489 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1]
490 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
491 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,0,2,3,4,5,6,7]
492 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
493 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7]
494 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
495 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
496 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7]
497 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
498 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,1,2,0,4,5,6,7]
499 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
500 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1]
501 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
502 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
503 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
504 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
505 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
506 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
507 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
508 ; SSE-NEXT: movapd %xmm0, (%rsi)
509 ; SSE-NEXT: movapd %xmm7, (%rdx)
510 ; SSE-NEXT: movapd %xmm8, (%rcx)
511 ; SSE-NEXT: movapd %xmm1, (%r8)
512 ; SSE-NEXT: retq
514 ; AVX-LABEL: load_i16_stride4_vf8:
515 ; AVX: # %bb.0:
516 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
517 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
518 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
519 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm3
520 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm4
521 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm0[1,2,3],xmm4[4],xmm0[5,6,7]
522 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7]
523 ; AVX-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
524 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
525 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
526 ; AVX-NEXT: vpackusdw %xmm6, %xmm0, %xmm0
527 ; AVX-NEXT: vpackusdw %xmm5, %xmm0, %xmm0
528 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
529 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
530 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
531 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
532 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
533 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
534 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
535 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
536 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
537 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
538 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
539 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
540 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7]
541 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
542 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7]
543 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
544 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
545 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
546 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
547 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
548 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
549 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
550 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
551 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
552 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
553 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
554 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
555 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
556 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
557 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
558 ; AVX-NEXT: vmovdqa %xmm5, (%rdx)
559 ; AVX-NEXT: vmovdqa %xmm6, (%rcx)
560 ; AVX-NEXT: vmovdqa %xmm1, (%r8)
561 ; AVX-NEXT: retq
563 ; AVX2-LABEL: load_i16_stride4_vf8:
564 ; AVX2: # %bb.0:
565 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
566 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
567 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
568 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
569 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
570 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
571 ; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
572 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
573 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
574 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
575 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
576 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm4
577 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
578 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
579 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
580 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
581 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
582 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
583 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
584 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
585 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
586 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
587 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
588 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
589 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7]
590 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
591 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7]
592 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
593 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
594 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
595 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
596 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
597 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
598 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
599 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
600 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
601 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
602 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
603 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
604 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
605 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
606 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
607 ; AVX2-NEXT: vmovdqa %xmm5, (%rdx)
608 ; AVX2-NEXT: vmovdqa %xmm6, (%rcx)
609 ; AVX2-NEXT: vmovdqa %xmm1, (%r8)
610 ; AVX2-NEXT: vzeroupper
611 ; AVX2-NEXT: retq
613 ; AVX2-FP-LABEL: load_i16_stride4_vf8:
614 ; AVX2-FP: # %bb.0:
615 ; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0
616 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
617 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
618 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
619 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
620 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
621 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
622 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
623 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
624 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2
625 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3
626 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm4
627 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
628 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm6
629 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm5
630 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
631 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
632 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm7
633 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
634 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
635 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
636 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
637 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7]
638 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
639 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7]
640 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
641 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
642 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
643 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
644 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
645 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
646 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
647 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
648 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
649 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
650 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
651 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
652 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
653 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
654 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsi)
655 ; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx)
656 ; AVX2-FP-NEXT: vmovdqa %xmm6, (%rcx)
657 ; AVX2-FP-NEXT: vmovdqa %xmm1, (%r8)
658 ; AVX2-FP-NEXT: vzeroupper
659 ; AVX2-FP-NEXT: retq
661 ; AVX2-FCP-LABEL: load_i16_stride4_vf8:
662 ; AVX2-FCP: # %bb.0:
663 ; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0
664 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
665 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
666 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
667 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
668 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
669 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
670 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
671 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
672 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
673 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
674 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
675 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
676 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm6
677 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5
678 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
679 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
680 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm7
681 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
682 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
683 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
684 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
685 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7]
686 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
687 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7]
688 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
689 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
690 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
691 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
692 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
693 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
694 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
695 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
696 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
697 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
698 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
699 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
700 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
701 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
702 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsi)
703 ; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx)
704 ; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rcx)
705 ; AVX2-FCP-NEXT: vmovdqa %xmm1, (%r8)
706 ; AVX2-FCP-NEXT: vzeroupper
707 ; AVX2-FCP-NEXT: retq
709 ; AVX512-LABEL: load_i16_stride4_vf8:
710 ; AVX512: # %bb.0:
711 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
712 ; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm1
713 ; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm2
714 ; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm3
715 ; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
716 ; AVX512-NEXT: vpmovqw %zmm1, (%rdx)
717 ; AVX512-NEXT: vpmovqw %zmm2, (%rcx)
718 ; AVX512-NEXT: vpmovqw %zmm3, (%r8)
719 ; AVX512-NEXT: vzeroupper
720 ; AVX512-NEXT: retq
722 ; AVX512-FCP-LABEL: load_i16_stride4_vf8:
723 ; AVX512-FCP: # %bb.0:
724 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
725 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
726 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
727 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm3
728 ; AVX512-FCP-NEXT: vpmovqw %zmm0, (%rsi)
729 ; AVX512-FCP-NEXT: vpmovqw %zmm1, (%rdx)
730 ; AVX512-FCP-NEXT: vpmovqw %zmm2, (%rcx)
731 ; AVX512-FCP-NEXT: vpmovqw %zmm3, (%r8)
732 ; AVX512-FCP-NEXT: vzeroupper
733 ; AVX512-FCP-NEXT: retq
735 ; AVX512DQ-LABEL: load_i16_stride4_vf8:
736 ; AVX512DQ: # %bb.0:
737 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
738 ; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm1
739 ; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm2
740 ; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm3
741 ; AVX512DQ-NEXT: vpmovqw %zmm0, (%rsi)
742 ; AVX512DQ-NEXT: vpmovqw %zmm1, (%rdx)
743 ; AVX512DQ-NEXT: vpmovqw %zmm2, (%rcx)
744 ; AVX512DQ-NEXT: vpmovqw %zmm3, (%r8)
745 ; AVX512DQ-NEXT: vzeroupper
746 ; AVX512DQ-NEXT: retq
748 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf8:
749 ; AVX512DQ-FCP: # %bb.0:
750 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
751 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
752 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
753 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm3
754 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, (%rsi)
755 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, (%rdx)
756 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, (%rcx)
757 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, (%r8)
758 ; AVX512DQ-FCP-NEXT: vzeroupper
759 ; AVX512DQ-FCP-NEXT: retq
761 ; AVX512BW-LABEL: load_i16_stride4_vf8:
762 ; AVX512BW: # %bb.0:
763 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
764 ; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm1
765 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
766 ; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm3
767 ; AVX512BW-NEXT: vpmovqw %zmm0, (%rsi)
768 ; AVX512BW-NEXT: vpmovqw %zmm1, (%rdx)
769 ; AVX512BW-NEXT: vpmovqw %zmm2, (%rcx)
770 ; AVX512BW-NEXT: vpmovqw %zmm3, (%r8)
771 ; AVX512BW-NEXT: vzeroupper
772 ; AVX512BW-NEXT: retq
774 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf8:
775 ; AVX512BW-FCP: # %bb.0:
776 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
777 ; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
778 ; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
779 ; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm3
780 ; AVX512BW-FCP-NEXT: vpmovqw %zmm0, (%rsi)
781 ; AVX512BW-FCP-NEXT: vpmovqw %zmm1, (%rdx)
782 ; AVX512BW-FCP-NEXT: vpmovqw %zmm2, (%rcx)
783 ; AVX512BW-FCP-NEXT: vpmovqw %zmm3, (%r8)
784 ; AVX512BW-FCP-NEXT: vzeroupper
785 ; AVX512BW-FCP-NEXT: retq
787 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf8:
788 ; AVX512DQ-BW: # %bb.0:
789 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
790 ; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm1
791 ; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm2
792 ; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm3
793 ; AVX512DQ-BW-NEXT: vpmovqw %zmm0, (%rsi)
794 ; AVX512DQ-BW-NEXT: vpmovqw %zmm1, (%rdx)
795 ; AVX512DQ-BW-NEXT: vpmovqw %zmm2, (%rcx)
796 ; AVX512DQ-BW-NEXT: vpmovqw %zmm3, (%r8)
797 ; AVX512DQ-BW-NEXT: vzeroupper
798 ; AVX512DQ-BW-NEXT: retq
800 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf8:
801 ; AVX512DQ-BW-FCP: # %bb.0:
802 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
803 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
804 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
805 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm3
806 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %zmm0, (%rsi)
807 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %zmm1, (%rdx)
808 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %zmm2, (%rcx)
809 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %zmm3, (%r8)
810 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
811 ; AVX512DQ-BW-FCP-NEXT: retq
812 %wide.vec = load <32 x i16>, ptr %in.vec, align 64
813 %strided.vec0 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
814 %strided.vec1 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
815 %strided.vec2 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
816 %strided.vec3 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
817 store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
818 store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
819 store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
820 store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
821 ret void
822 }
824 define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
825 ; SSE-LABEL: load_i16_stride4_vf16:
826 ; SSE: # %bb.0:
827 ; SSE-NEXT: movdqa 96(%rdi), %xmm4
828 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
829 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
830 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
831 ; SSE-NEXT: movdqa 80(%rdi), %xmm3
832 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
833 ; SSE-NEXT: movdqa (%rdi), %xmm8
834 ; SSE-NEXT: movdqa 16(%rdi), %xmm10
835 ; SSE-NEXT: movdqa 32(%rdi), %xmm7
836 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
837 ; SSE-NEXT: movdqa 48(%rdi), %xmm6
838 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,2,3]
839 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7]
840 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,2,2,3]
841 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7]
842 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
843 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,2,2,3]
844 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,0,2,4,5,6,7]
845 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3]
846 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[0,1,0,2,4,5,6,7]
847 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
848 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
849 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3]
850 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7]
851 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3]
852 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7]
853 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
854 ; SSE-NEXT: movdqa 112(%rdi), %xmm11
855 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
856 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7]
857 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
858 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,0,2,4,5,6,7]
859 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
860 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
861 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
862 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7]
863 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
864 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7]
865 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[0,1,1,3,4,5,6,7]
866 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
867 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
868 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,3,2,3,4,5,6,7]
869 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[1,3,2,3,4,5,6,7]
870 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
871 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7]
872 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,1,1,3,4,5,6,7]
873 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
874 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
875 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,1,2,3]
876 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7]
877 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,3]
878 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
879 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
880 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
881 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,1,2,0,4,5,6,7]
882 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
883 ; SSE-NEXT: # xmm10 = mem[3,1,2,3]
884 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,2,0,4,5,6,7]
885 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
886 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1]
887 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
888 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
889 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[2,0,2,3,4,5,6,7]
890 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
891 ; SSE-NEXT: # xmm12 = mem[3,1,2,3]
892 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[2,0,2,3,4,5,6,7]
893 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1]
894 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[3,1,2,3]
895 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7]
896 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
897 ; SSE-NEXT: # xmm14 = mem[3,1,2,3]
898 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[0,1,2,0,4,5,6,7]
899 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
900 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm13[0],xmm15[1]
901 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
902 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
903 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
904 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,3,1,4,5,6,7]
905 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7]
906 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
907 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
908 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7]
909 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7]
910 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
911 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,1,4,5,6,7]
912 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7]
913 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
914 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
915 ; SSE-NEXT: movapd %xmm4, 16(%rsi)
916 ; SSE-NEXT: movapd %xmm7, (%rsi)
917 ; SSE-NEXT: movapd %xmm9, 16(%rdx)
918 ; SSE-NEXT: movapd %xmm5, (%rdx)
919 ; SSE-NEXT: movapd %xmm15, 16(%rcx)
920 ; SSE-NEXT: movapd %xmm6, (%rcx)
921 ; SSE-NEXT: movapd %xmm3, 16(%r8)
922 ; SSE-NEXT: movapd %xmm1, (%r8)
923 ; SSE-NEXT: retq
925 ; AVX-LABEL: load_i16_stride4_vf16:
926 ; AVX: # %bb.0:
927 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
928 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm5
929 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7]
930 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm6
931 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm1[1,2,3],xmm6[4],xmm1[5,6,7]
932 ; AVX-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
933 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm7
934 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm1[1,2,3],xmm7[4],xmm1[5,6,7]
935 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm8
936 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm1[1,2,3],xmm8[4],xmm1[5,6,7]
937 ; AVX-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
938 ; AVX-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
939 ; AVX-NEXT: vmovdqa (%rdi), %xmm2
940 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
941 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm4
942 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm9
943 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0],xmm1[1,2,3],xmm9[4],xmm1[5,6,7]
944 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7]
945 ; AVX-NEXT: vpackusdw %xmm10, %xmm11, %xmm10
946 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7]
947 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7]
948 ; AVX-NEXT: vpackusdw %xmm11, %xmm1, %xmm1
949 ; AVX-NEXT: vpackusdw %xmm10, %xmm1, %xmm1
950 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3]
951 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
952 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,2,2,3]
953 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
954 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
955 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3]
956 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
957 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,2,2,3]
958 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
959 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
960 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
961 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
962 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,2,2,3]
963 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
964 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
965 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
966 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
967 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3]
968 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
969 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,2,2,3]
970 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7]
971 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
972 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
973 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
974 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
975 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7]
976 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
977 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[0,1,2,0,4,5,6,7]
978 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
979 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
980 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7]
981 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
982 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,0,2,3,4,5,6,7]
983 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
984 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
985 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
986 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3]
987 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,1,2,0,4,5,6,7]
988 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
989 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,1,2,0,4,5,6,7]
990 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
991 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
992 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7]
993 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
994 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7]
995 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
996 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7]
997 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
998 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
999 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
1000 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1001 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
1002 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
1003 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1004 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1005 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
1006 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,1,3,1,4,5,6,7]
1007 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
1008 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
1009 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
1010 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
1011 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1012 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
1013 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
1014 ; AVX-NEXT: vmovdqa %xmm1, (%rsi)
1015 ; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
1016 ; AVX-NEXT: vmovaps %ymm10, (%rdx)
1017 ; AVX-NEXT: vmovaps %ymm11, (%rcx)
1018 ; AVX-NEXT: vmovaps %ymm2, (%r8)
1019 ; AVX-NEXT: vzeroupper
1020 ; AVX-NEXT: retq
1022 ; AVX2-LABEL: load_i16_stride4_vf16:
1023 ; AVX2: # %bb.0:
1024 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
1025 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1026 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1027 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1028 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1029 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1030 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1031 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1032 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1033 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1034 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1035 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
1036 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
1037 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1038 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
1039 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1040 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
1041 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1042 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
1043 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1044 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1045 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
1046 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
1047 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5
1048 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm6
1049 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
1050 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
1051 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7
1052 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
1053 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
1054 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
1055 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
1056 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm8
1057 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3]
1058 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
1059 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm10
1060 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,2,2,3]
1061 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
1062 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
1063 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1064 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7]
1065 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,2,3]
1066 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
1067 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3]
1068 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
1069 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
1070 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,2,2,3]
1071 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
1072 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
1073 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
1074 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
1075 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
1076 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
1077 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
1078 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,1,2,0,4,5,6,7]
1079 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
1080 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,1,2,0,4,5,6,7]
1081 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
1082 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1083 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
1084 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7]
1085 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
1086 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,0,2,3,4,5,6,7]
1087 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
1088 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1089 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
1090 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
1091 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7]
1092 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
1093 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,1,2,0,4,5,6,7]
1094 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
1095 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
1096 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7]
1097 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1098 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7]
1099 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
1100 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1101 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
1102 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
1103 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
1104 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1105 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
1106 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
1107 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7]
1108 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
1109 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1110 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
1111 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
1112 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
1113 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
1114 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
1115 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1116 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1117 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
1118 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
1119 ; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
1120 ; AVX2-NEXT: vmovdqa %ymm4, (%rdx)
1121 ; AVX2-NEXT: vmovdqa %ymm9, (%rcx)
1122 ; AVX2-NEXT: vmovdqa %ymm1, (%r8)
1123 ; AVX2-NEXT: vzeroupper
1124 ; AVX2-NEXT: retq
1126 ; AVX2-FP-LABEL: load_i16_stride4_vf16:
1127 ; AVX2-FP: # %bb.0:
1128 ; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0
1129 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1130 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
1131 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1132 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1133 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
1134 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1135 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1136 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1137 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
1138 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1139 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
1140 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
1141 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1142 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
1143 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1144 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
1145 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1146 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
1147 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1148 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
1149 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3
1150 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4
1151 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm5
1152 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
1153 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm6
1154 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm7
1155 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm8
1156 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm8, %xmm9
1157 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
1158 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1159 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
1160 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm10
1161 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm10, %xmm11
1162 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm12
1163 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm12, %xmm13
1164 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
1165 ; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1166 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7]
1167 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm11
1168 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
1169 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
1170 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm11
1171 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm9
1172 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
1173 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
1174 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
1175 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
1176 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7]
1177 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
1178 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7]
1179 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
1180 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1181 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[3,1,2,3]
1182 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,0,2,3,4,5,6,7]
1183 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3]
1184 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7]
1185 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
1186 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
1187 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
1188 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
1189 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,1,2,0,4,5,6,7]
1190 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
1191 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,1,2,0,4,5,6,7]
1192 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
1193 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
1194 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,0,2,3,4,5,6,7]
1195 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1196 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7]
1197 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
1198 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
1199 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
1200 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
1201 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7]
1202 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
1203 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
1204 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7]
1205 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7]
1206 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
1207 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1208 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
1209 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
1210 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
1211 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1212 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
1213 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1214 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1215 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
1216 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
1217 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rsi)
1218 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rdx)
1219 ; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx)
1220 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%r8)
1221 ; AVX2-FP-NEXT: vzeroupper
1222 ; AVX2-FP-NEXT: retq
1224 ; AVX2-FCP-LABEL: load_i16_stride4_vf16:
1225 ; AVX2-FCP: # %bb.0:
1226 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
1227 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
1228 ; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0
1229 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1230 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
1231 ; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
1232 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1233 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
1234 ; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
1235 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1236 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6]
1237 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm4
1238 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
1239 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6
1240 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
1241 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7
1242 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
1243 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
1244 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6
1245 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm7
1246 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm8
1247 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm9
1248 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
1249 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11
1250 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm10
1251 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
1252 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
1253 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12
1254 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm11
1255 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
1256 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
1257 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
1258 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
1259 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
1260 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
1261 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
1262 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,2,3,1,3,5,7]
1263 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2
1264 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm10
1265 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1
1266 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm4
1267 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7]
1268 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3]
1269 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,1,2,0,4,5,6,7]
1270 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
1271 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,2,0,4,5,6,7]
1272 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
1273 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
1274 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,0,2,3,4,5,6,7]
1275 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
1276 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7]
1277 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
1278 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1279 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
1280 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2
1281 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
1282 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
1283 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,3,1,4,5,6,7]
1284 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7]
1285 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1286 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7]
1287 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
1288 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1289 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
1290 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1291 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1292 ; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rdx)
1293 ; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rcx)
1294 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8)
1295 ; AVX2-FCP-NEXT: vzeroupper
1296 ; AVX2-FCP-NEXT: retq
1298 ; AVX512-LABEL: load_i16_stride4_vf16:
1299 ; AVX512: # %bb.0:
1300 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0
1301 ; AVX512-NEXT: vpmovqw %ymm0, %xmm0
1302 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1303 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1
1304 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
1305 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
1306 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm4
1307 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
1308 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7]
1309 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1310 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
1311 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
1312 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
1313 ; AVX512-NEXT: vpmovqw %zmm3, %xmm6
1314 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
1315 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm6
1316 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm7
1317 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
1318 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
1319 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1320 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
1321 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1322 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
1323 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
1324 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
1325 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1326 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1327 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
1328 ; AVX512-NEXT: vpsrlq $16, %zmm3, %zmm5
1329 ; AVX512-NEXT: vpmovqw %zmm5, %xmm5
1330 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
1331 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1332 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7]
1333 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
1334 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
1335 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1336 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1337 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
1338 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
1339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
1340 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
1341 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
1342 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1343 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
1344 ; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm8
1345 ; AVX512-NEXT: vpmovqw %zmm8, %xmm8
1346 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
1347 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
1348 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
1349 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1350 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1351 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
1352 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
1353 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
1354 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
1355 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
1356 ; AVX512-NEXT: vpsrlq $48, %zmm3, %zmm3
1357 ; AVX512-NEXT: vpmovqw %zmm3, %xmm3
1358 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
1359 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
1360 ; AVX512-NEXT: vmovdqa %ymm2, (%rdx)
1361 ; AVX512-NEXT: vmovdqa %ymm5, (%rcx)
1362 ; AVX512-NEXT: vmovdqa %ymm1, (%r8)
1363 ; AVX512-NEXT: vzeroupper
1364 ; AVX512-NEXT: retq
1366 ; AVX512-FCP-LABEL: load_i16_stride4_vf16:
1367 ; AVX512-FCP: # %bb.0:
1368 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
1369 ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
1370 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
1371 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
1372 ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3
1373 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm4
1374 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
1375 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
1376 ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1
1377 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7
1378 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14]
1379 ; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm8, %ymm7
1380 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
1381 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm9
1382 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
1383 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
1384 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
1385 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
1386 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
1387 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3
1388 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
1389 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
1390 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
1391 ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
1392 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
1393 ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3
1394 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
1395 ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm8, %ymm5
1396 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
1397 ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0
1398 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
1399 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
1400 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
1401 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
1402 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm4, %zmm3
1403 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
1404 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
1405 ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rsi)
1406 ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx)
1407 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
1408 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8)
1409 ; AVX512-FCP-NEXT: vzeroupper
1410 ; AVX512-FCP-NEXT: retq
1412 ; AVX512DQ-LABEL: load_i16_stride4_vf16:
1413 ; AVX512DQ: # %bb.0:
1414 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0
1415 ; AVX512DQ-NEXT: vpmovqw %ymm0, %xmm0
1416 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1417 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1
1418 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
1419 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
1420 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm4
1421 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
1422 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7]
1423 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1424 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
1425 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
1426 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm3
1427 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm6
1428 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
1429 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm6
1430 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm7
1431 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
1432 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
1433 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1434 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
1435 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1436 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
1437 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
1438 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
1439 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1440 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1441 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
1442 ; AVX512DQ-NEXT: vpsrlq $16, %zmm3, %zmm5
1443 ; AVX512DQ-NEXT: vpmovqw %zmm5, %xmm5
1444 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
1445 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1446 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7]
1447 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
1448 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
1449 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1450 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1451 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
1452 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
1453 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
1454 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
1455 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
1456 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1457 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
1458 ; AVX512DQ-NEXT: vpsrlq $32, %zmm3, %zmm8
1459 ; AVX512DQ-NEXT: vpmovqw %zmm8, %xmm8
1460 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
1461 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
1462 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
1463 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1464 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1465 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
1466 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
1467 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
1468 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
1469 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
1470 ; AVX512DQ-NEXT: vpsrlq $48, %zmm3, %zmm3
1471 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
1472 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
1473 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi)
1474 ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rdx)
1475 ; AVX512DQ-NEXT: vmovdqa %ymm5, (%rcx)
1476 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%r8)
1477 ; AVX512DQ-NEXT: vzeroupper
1478 ; AVX512DQ-NEXT: retq
1480 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf16:
1481 ; AVX512DQ-FCP: # %bb.0:
1482 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
1483 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
1484 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
1485 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
1486 ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3
1487 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm4
1488 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
1489 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
1490 ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1
1491 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7
1492 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14]
1493 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm8, %ymm7
1494 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
1495 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm9
1496 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
1497 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
1498 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
1499 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
1500 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
1501 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3
1502 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
1503 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
1504 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
1505 ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
1506 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
1507 ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3
1508 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
1509 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm8, %ymm5
1510 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
1511 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0
1512 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
1513 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
1514 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
1515 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
1516 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm4, %zmm3
1517 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
1518 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
1519 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rsi)
1520 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx)
1521 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
1522 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8)
1523 ; AVX512DQ-FCP-NEXT: vzeroupper
1524 ; AVX512DQ-FCP-NEXT: retq
1526 ; AVX512BW-LABEL: load_i16_stride4_vf16:
1527 ; AVX512BW: # %bb.0:
1528 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
1529 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
1530 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
1531 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1532 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
1533 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1534 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
1535 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1536 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
1537 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1538 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
1539 ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx)
1540 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rcx)
1541 ; AVX512BW-NEXT: vmovdqa %ymm5, (%r8)
1542 ; AVX512BW-NEXT: vzeroupper
1543 ; AVX512BW-NEXT: retq
1545 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf16:
1546 ; AVX512BW-FCP: # %bb.0:
1547 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
1548 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
1549 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
1550 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1551 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
1552 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1553 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
1554 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1555 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
1556 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1557 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1558 ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
1559 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
1560 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
1561 ; AVX512BW-FCP-NEXT: vzeroupper
1562 ; AVX512BW-FCP-NEXT: retq
1564 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf16:
1565 ; AVX512DQ-BW: # %bb.0:
1566 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
1567 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
1568 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
1569 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1570 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
1571 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1572 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
1573 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1574 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
1575 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1576 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi)
1577 ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx)
1578 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rcx)
1579 ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%r8)
1580 ; AVX512DQ-BW-NEXT: vzeroupper
1581 ; AVX512DQ-BW-NEXT: retq
1583 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf16:
1584 ; AVX512DQ-BW-FCP: # %bb.0:
1585 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
1586 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
1587 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
1588 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1589 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
1590 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1591 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
1592 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1593 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
1594 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1595 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1596 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
1597 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
1598 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
1599 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1600 ; AVX512DQ-BW-FCP-NEXT: retq
1601 %wide.vec = load <64 x i16>, ptr %in.vec, align 64
1602 %strided.vec0 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
1603 %strided.vec1 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
1604 %strided.vec2 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
1605 %strided.vec3 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
1606 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
1607 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
1608 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
1609 store <16 x i16> %strided.vec3, ptr %out.vec3, align 64
1610 ret void
1611 }
1613 define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
1614 ; SSE-LABEL: load_i16_stride4_vf32:
1615 ; SSE: # %bb.0:
1616 ; SSE-NEXT: subq $248, %rsp
1617 ; SSE-NEXT: movdqa 160(%rdi), %xmm3
1618 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1619 ; SSE-NEXT: movdqa 128(%rdi), %xmm4
1620 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1621 ; SSE-NEXT: movdqa 144(%rdi), %xmm5
1622 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1623 ; SSE-NEXT: movdqa 96(%rdi), %xmm2
1624 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1625 ; SSE-NEXT: movdqa 112(%rdi), %xmm6
1626 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1627 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
1628 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1629 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
1630 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1631 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1632 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1633 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1634 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1635 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
1636 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1637 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1638 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
1639 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1640 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1641 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,2,2,3]
1642 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,1,0,2,4,5,6,7]
1643 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1644 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1645 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1646 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[0,2,2,3]
1647 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7]
1648 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,2,2,3]
1649 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7]
1650 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1651 ; SSE-NEXT: movdqa 176(%rdi), %xmm0
1652 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1653 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3]
1654 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7]
1655 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3]
1656 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,1,0,2,4,5,6,7]
1657 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1658 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1659 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1660 ; SSE-NEXT: movdqa (%rdi), %xmm1
1661 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1662 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
1663 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1664 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,2,2,3]
1665 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7]
1666 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
1667 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7]
1668 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1669 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
1670 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1671 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
1672 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1673 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
1674 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
1675 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
1676 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,0,2,4,5,6,7]
1677 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1678 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1679 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1680 ; SSE-NEXT: movdqa 192(%rdi), %xmm0
1681 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1682 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
1683 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1684 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
1685 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
1686 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
1687 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
1688 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1689 ; SSE-NEXT: movdqa 224(%rdi), %xmm2
1690 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1691 ; SSE-NEXT: movdqa 240(%rdi), %xmm1
1692 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1693 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
1694 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7]
1695 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1696 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm2[0,1,0,2,4,5,6,7]
1697 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
1698 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
1699 ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1700 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1701 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
1702 ; SSE-NEXT: pshuflw $237, (%rsp), %xmm1 # 16-byte Folded Reload
1703 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
1704 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1705 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1706 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
1707 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7]
1708 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
1709 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
1710 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1711 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[1,3,2,3,4,5,6,7]
1712 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7]
1713 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1714 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,3,4,5,6,7]
1715 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
1716 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
1717 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
1718 ; SSE-NEXT: movapd %xmm9, (%rsp) # 16-byte Spill
1719 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,3,2,3,4,5,6,7]
1720 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[1,3,2,3,4,5,6,7]
1721 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1722 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7]
1723 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm6[0,1,1,3,4,5,6,7]
1724 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
1725 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
1726 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
1727 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7]
1728 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1729 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7]
1730 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,1,1,3,4,5,6,7]
1731 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
1732 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
1733 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1734 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
1735 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1736 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1737 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
1738 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1739 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
1740 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
1741 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1742 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1743 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
1744 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1745 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
1746 ; SSE-NEXT: # xmm9 = mem[3,1,2,3]
1747 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
1748 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7]
1749 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
1750 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
1751 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1752 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
1753 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1754 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1755 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
1756 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1757 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
1758 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
1759 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1760 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1761 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
1762 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1763 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
1764 ; SSE-NEXT: # xmm14 = mem[3,1,2,3]
1765 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
1766 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[0,1,2,0,4,5,6,7]
1767 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
1768 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1]
1769 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1770 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
1771 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1772 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1773 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
1774 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1775 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
1776 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
1777 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
1778 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1779 ; SSE-NEXT: # xmm12 = mem[3,1,2,3]
1780 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
1781 ; SSE-NEXT: # xmm10 = mem[3,1,2,3]
1782 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7]
1783 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[0,1,2,0,4,5,6,7]
1784 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
1785 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
1786 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
1787 ; SSE-NEXT: # xmm8 = mem[3,1,2,3]
1788 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
1789 ; SSE-NEXT: # xmm7 = mem[3,1,2,3]
1790 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7]
1791 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
1792 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1793 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
1794 ; SSE-NEXT: # xmm6 = mem[3,1,2,3]
1795 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1796 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
1797 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7]
1798 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7]
1799 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1800 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1801 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1802 ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
1803 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1804 ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
1805 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1806 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1807 ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
1808 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7]
1809 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
1810 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1]
1811 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1812 ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
1813 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1814 ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
1815 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1816 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1817 ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
1818 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7]
1819 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
1820 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1]
1821 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1822 ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
1823 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1824 ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
1825 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1826 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,3,1,4,5,6,7]
1827 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7]
1828 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
1829 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1]
1830 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,1,2,3,4,5,6,7]
1831 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7]
1832 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1833 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,1,4,5,6,7]
1834 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
1835 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
1836 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
1837 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1838 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
1839 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1840 ; SSE-NEXT: movaps %xmm0, (%rsi)
1841 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1842 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
1843 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1844 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
1845 ; SSE-NEXT: movapd %xmm15, 48(%rdx)
1846 ; SSE-NEXT: movapd %xmm13, (%rdx)
1847 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1848 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
1849 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1850 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
1851 ; SSE-NEXT: movapd %xmm1, 48(%rcx)
1852 ; SSE-NEXT: movapd %xmm4, 32(%rcx)
1853 ; SSE-NEXT: movapd %xmm11, 16(%rcx)
1854 ; SSE-NEXT: movapd %xmm5, (%rcx)
1855 ; SSE-NEXT: movapd %xmm3, 48(%r8)
1856 ; SSE-NEXT: movapd %xmm10, 32(%r8)
1857 ; SSE-NEXT: movapd %xmm14, 16(%r8)
1858 ; SSE-NEXT: movapd %xmm9, (%r8)
1859 ; SSE-NEXT: addq $248, %rsp
1860 ; SSE-NEXT: retq
1862 ; AVX-LABEL: load_i16_stride4_vf32:
1863 ; AVX: # %bb.0:
1864 ; AVX-NEXT: subq $280, %rsp # imm = 0x118
1865 ; AVX-NEXT: vpxor %xmm6, %xmm6, %xmm6
1866 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm4
1867 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm6[1,2,3],xmm4[4],xmm6[5,6,7]
1868 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1869 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm11
1870 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm6[1,2,3],xmm11[4],xmm6[5,6,7]
1871 ; AVX-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
1872 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm3
1873 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7]
1874 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1875 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm0
1876 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1877 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
1878 ; AVX-NEXT: vpackusdw %xmm5, %xmm7, %xmm5
1879 ; AVX-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
1880 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1881 ; AVX-NEXT: vmovdqa (%rdi), %xmm10
1882 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm0
1883 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1884 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1885 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1886 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
1887 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
1888 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1889 ; AVX-NEXT: vpackusdw %xmm7, %xmm8, %xmm7
1890 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
1891 ; AVX-NEXT: vmovdqa %xmm0, %xmm2
1892 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1893 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm6[1,2,3],xmm10[4],xmm6[5,6,7]
1894 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1895 ; AVX-NEXT: vpackusdw %xmm8, %xmm9, %xmm8
1896 ; AVX-NEXT: vpackusdw %xmm7, %xmm8, %xmm7
1897 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1898 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
1899 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1900 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
1901 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm12
1902 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm6[1,2,3],xmm12[4],xmm6[5,6,7]
1903 ; AVX-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill
1904 ; AVX-NEXT: vpackusdw %xmm7, %xmm8, %xmm7
1905 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm5
1906 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6,7]
1907 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1908 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm0
1909 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
1910 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1911 ; AVX-NEXT: vpackusdw %xmm8, %xmm9, %xmm8
1912 ; AVX-NEXT: vpackusdw %xmm7, %xmm8, %xmm7
1913 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1914 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm14
1915 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm6[1,2,3],xmm14[4],xmm6[5,6,7]
1916 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1917 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm13
1918 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm6[1,2,3],xmm13[4],xmm6[5,6,7]
1919 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1920 ; AVX-NEXT: vpackusdw %xmm7, %xmm15, %xmm7
1921 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm9
1922 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm9[0],xmm6[1,2,3],xmm9[4],xmm6[5,6,7]
1923 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1924 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm8
1925 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1926 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7]
1927 ; AVX-NEXT: vpackusdw %xmm15, %xmm6, %xmm6
1928 ; AVX-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
1929 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1930 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,2,2,3]
1931 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
1932 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
1933 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
1934 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1935 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
1936 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
1937 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1938 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,2,2,3]
1939 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
1940 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1]
1941 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
1942 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1943 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3]
1944 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
1945 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,2,2,3]
1946 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7]
1947 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1]
1948 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,2,2,3]
1949 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
1950 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,2,2,3]
1951 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
1952 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1]
1953 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7]
1954 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1955 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1956 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1957 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1958 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3]
1959 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
1960 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,2,3]
1961 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
1962 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1963 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3]
1964 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
1965 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3]
1966 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
1967 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
1968 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
1969 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3]
1970 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
1971 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,2,2,3]
1972 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
1973 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
1974 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3]
1975 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
1976 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1977 ; AVX-NEXT: # xmm15 = mem[0,2,2,3]
1978 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
1979 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
1980 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7]
1981 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1982 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1983 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1984 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1985 ; AVX-NEXT: # xmm0 = mem[3,1,2,3]
1986 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1987 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3]
1988 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1989 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
1990 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
1991 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1992 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1993 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
1994 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1995 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
1996 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1997 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
1998 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
1999 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2000 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2001 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,1,2,3]
2002 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2003 ; AVX-NEXT: # xmm6 = mem[3,1,2,3]
2004 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7]
2005 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7]
2006 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2007 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2008 ; AVX-NEXT: # xmm12 = mem[3,1,2,3]
2009 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2010 ; AVX-NEXT: # xmm5 = mem[3,1,2,3]
2011 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,0,2,3,4,5,6,7]
2012 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[2,0,2,3,4,5,6,7]
2013 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2014 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7]
2015 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2016 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2017 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2018 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[3,1,2,3]
2019 ; AVX-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload
2020 ; AVX-NEXT: # xmm14 = mem[3,1,2,3]
2021 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7]
2022 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7]
2023 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2024 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2025 ; AVX-NEXT: # xmm10 = mem[3,1,2,3]
2026 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
2027 ; AVX-NEXT: # xmm11 = mem[3,1,2,3]
2028 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7]
2029 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,0,2,3,4,5,6,7]
2030 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
2031 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2032 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
2033 ; AVX-NEXT: # xmm8 = mem[3,1,2,3]
2034 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2035 ; AVX-NEXT: # xmm9 = mem[3,1,2,3]
2036 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7]
2037 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7]
2038 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2039 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2040 ; AVX-NEXT: # xmm4 = mem[3,1,2,3]
2041 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2042 ; AVX-NEXT: # xmm2 = mem[3,1,2,3]
2043 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7]
2044 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7]
2045 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
2046 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2047 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
2048 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2049 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2050 ; AVX-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
2051 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2052 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
2053 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2054 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2055 ; AVX-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
2056 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2057 ; AVX-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
2058 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2059 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2060 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7]
2061 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7]
2062 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2063 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
2064 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
2065 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2066 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
2067 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2068 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2069 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7]
2070 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7]
2071 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2072 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,1,2,3,4,5,6,7]
2073 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7]
2074 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2075 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
2076 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,3,1,4,5,6,7]
2077 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,3,1,4,5,6,7]
2078 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2079 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
2080 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
2081 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2082 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
2083 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2084 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2085 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2086 ; AVX-NEXT: vmovaps %xmm2, 32(%rsi)
2087 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2088 ; AVX-NEXT: vmovaps %xmm2, 48(%rsi)
2089 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2090 ; AVX-NEXT: vmovaps %xmm2, (%rsi)
2091 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2092 ; AVX-NEXT: vmovaps %xmm2, 16(%rsi)
2093 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2094 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
2095 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2096 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
2097 ; AVX-NEXT: vmovaps %ymm15, 32(%rcx)
2098 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2099 ; AVX-NEXT: vmovaps %ymm2, (%rcx)
2100 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
2101 ; AVX-NEXT: vmovaps %ymm0, (%r8)
2102 ; AVX-NEXT: addq $280, %rsp # imm = 0x118
2103 ; AVX-NEXT: vzeroupper
2104 ; AVX-NEXT: retq
2105 ;
2106 ; AVX2-LABEL: load_i16_stride4_vf32:
2107 ; AVX2: # %bb.0:
2108 ; AVX2-NEXT: subq $168, %rsp
2109 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
2110 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2111 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2112 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2113 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2114 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2115 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2116 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2117 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2118 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2119 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2120 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
2121 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2122 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2123 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
2124 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
2125 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
2126 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2127 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2128 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2129 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2130 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2131 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2132 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2133 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2134 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2135 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2136 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2137 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2138 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2139 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2140 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
2141 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2142 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2143 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
2144 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
2145 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
2146 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2147 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
2148 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2149 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2150 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
2151 ; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
2152 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm5
2153 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2154 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm6
2155 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2156 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7
2157 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm9
2158 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3]
2159 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
2160 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm13
2161 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,2,2,3]
2162 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
2163 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2164 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2165 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm14
2166 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3]
2167 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,3,2,3,4,5,6,7]
2168 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
2169 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
2170 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
2171 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2172 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2173 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2174 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3]
2175 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
2176 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
2177 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
2178 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2179 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2180 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
2181 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,2,2,3]
2182 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
2183 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
2184 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2185 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
2186 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2187 ; AVX2-NEXT: vmovdqa 240(%rdi), %xmm0
2188 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2189 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2190 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
2191 ; AVX2-NEXT: vmovdqa 224(%rdi), %xmm10
2192 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
2193 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
2194 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2195 ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm11
2196 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
2197 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
2198 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm15
2199 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3]
2200 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
2201 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2202 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2203 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2204 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2205 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm12
2206 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
2207 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
2208 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm4
2209 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
2210 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
2211 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2212 ; AVX2-NEXT: vmovdqa 144(%rdi), %xmm3
2213 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3]
2214 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
2215 ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm6
2216 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2217 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
2218 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
2219 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
2220 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
2221 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2222 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2223 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3]
2224 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2225 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
2226 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2227 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
2228 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
2229 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2230 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,1,2,3]
2231 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2232 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
2233 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2234 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
2235 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7]
2236 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
2237 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2238 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2239 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2240 ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3]
2241 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2242 ; AVX2-NEXT: # xmm9 = mem[3,1,2,3]
2243 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7]
2244 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7]
2245 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
2246 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
2247 ; AVX2-NEXT: # xmm7 = mem[3,1,2,3]
2248 ; AVX2-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload
2249 ; AVX2-NEXT: # xmm6 = mem[3,1,2,3]
2250 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
2251 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7]
2252 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
2253 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2254 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2255 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
2256 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
2257 ; AVX2-NEXT: # xmm8 = mem[3,1,2,3]
2258 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
2259 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7]
2260 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
2261 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2262 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
2263 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
2264 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,0,2,3,4,5,6,7]
2265 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7]
2266 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2267 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2268 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2269 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2270 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
2271 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3]
2272 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7]
2273 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7]
2274 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2275 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
2276 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2277 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3]
2278 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7]
2279 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,0,2,3,4,5,6,7]
2280 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
2281 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2282 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2283 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2284 ; AVX2-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
2285 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2286 ; AVX2-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
2287 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2288 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2289 ; AVX2-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
2290 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2291 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
2292 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2293 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2294 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2295 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2296 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7]
2297 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,3,1,4,5,6,7]
2298 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2299 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7]
2300 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
2301 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
2302 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2303 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2304 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,3,1,4,5,6,7]
2305 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7]
2306 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2307 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7]
2308 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[3,1,2,3,4,5,6,7]
2309 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
2310 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2311 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2312 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2313 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7]
2314 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
2315 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
2316 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
2317 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
2318 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2319 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
2320 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2321 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2322 ; AVX2-NEXT: vmovaps %ymm2, 32(%rsi)
2323 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2324 ; AVX2-NEXT: vmovaps %ymm2, (%rsi)
2325 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2326 ; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
2327 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2328 ; AVX2-NEXT: vmovaps %ymm2, (%rdx)
2329 ; AVX2-NEXT: vmovdqa %ymm14, 32(%rcx)
2330 ; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
2331 ; AVX2-NEXT: vmovaps %ymm2, (%rcx)
2332 ; AVX2-NEXT: vmovdqa %ymm1, 32(%r8)
2333 ; AVX2-NEXT: vmovdqa %ymm0, (%r8)
2334 ; AVX2-NEXT: addq $168, %rsp
2335 ; AVX2-NEXT: vzeroupper
2336 ; AVX2-NEXT: retq
2337 ;
2338 ; AVX2-FP-LABEL: load_i16_stride4_vf32:
2339 ; AVX2-FP: # %bb.0:
2340 ; AVX2-FP-NEXT: subq $184, %rsp
2341 ; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0
2342 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2343 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
2344 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2345 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2346 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
2347 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2348 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2349 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2350 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
2351 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2352 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
2353 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2354 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2355 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
2356 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
2357 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
2358 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2359 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2360 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2361 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2362 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2363 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
2364 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2365 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2366 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
2367 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2368 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2369 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2370 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
2371 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2372 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
2373 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2374 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2375 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
2376 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
2377 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
2378 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2379 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
2380 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2381 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2382 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm13
2383 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2384 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm12
2385 ; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2386 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm14
2387 ; AVX2-FP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2388 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm7
2389 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
2390 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm2
2391 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm0
2392 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm4
2393 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm1
2394 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2395 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
2396 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
2397 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm5
2398 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm5, %xmm8
2399 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm6
2400 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm6, %xmm9
2401 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2402 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
2403 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7]
2404 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm7, %xmm8
2405 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm9
2406 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2407 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm12, %xmm9
2408 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm13, %xmm12
2409 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
2410 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
2411 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7]
2412 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2413 ; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm14
2414 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm9
2415 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm8
2416 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm8, %xmm12
2417 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
2418 ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm0
2419 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2420 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
2421 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm15
2422 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm15, %xmm13
2423 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
2424 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2425 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2426 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm9[6,7]
2427 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm0
2428 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2429 ; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm13
2430 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm13, %xmm12
2431 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm10
2432 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1]
2433 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm3
2434 ; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
2435 ; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm12
2436 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm12, %xmm0
2437 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm11
2438 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
2439 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3]
2440 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2441 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2442 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
2443 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2444 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3]
2445 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2446 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
2447 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
2448 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2449 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3]
2450 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2451 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3]
2452 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2453 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
2454 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
2455 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2456 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2457 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2458 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2459 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
2460 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2461 ; AVX2-FP-NEXT: # xmm6 = mem[3,1,2,3]
2462 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7]
2463 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7]
2464 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2465 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2466 ; AVX2-FP-NEXT: # xmm5 = mem[3,1,2,3]
2467 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2468 ; AVX2-FP-NEXT: # xmm4 = mem[3,1,2,3]
2469 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7]
2470 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,0,2,3,4,5,6,7]
2471 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
2472 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2473 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2474 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2475 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
2476 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3]
2477 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,2,0,4,5,6,7]
2478 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7]
2479 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
2480 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
2481 ; AVX2-FP-NEXT: # xmm8 = mem[3,1,2,3]
2482 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[3,1,2,3]
2483 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,0,2,3,4,5,6,7]
2484 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7]
2485 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
2486 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2487 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2488 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm0[6,7]
2489 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
2490 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2491 ; AVX2-FP-NEXT: # xmm9 = mem[3,1,2,3]
2492 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,1,2,0,4,5,6,7]
2493 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7]
2494 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1]
2495 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[3,1,2,3]
2496 ; AVX2-FP-NEXT: vpshufd $231, (%rsp), %xmm12 # 16-byte Folded Reload
2497 ; AVX2-FP-NEXT: # xmm12 = mem[3,1,2,3]
2498 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7]
2499 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7]
2500 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
2501 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3]
2502 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2503 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2504 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
2505 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2506 ; AVX2-FP-NEXT: # xmm10 = mem[0,1,3,1,4,5,6,7]
2507 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
2508 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2509 ; AVX2-FP-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7]
2510 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
2511 ; AVX2-FP-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7]
2512 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
2513 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2514 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
2515 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7]
2516 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
2517 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
2518 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
2519 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
2520 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
2521 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
2522 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3]
2523 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
2524 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
2525 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
2526 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2527 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7]
2528 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7]
2529 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2530 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2531 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2532 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2533 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7]
2534 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7]
2535 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2536 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7]
2537 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7]
2538 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
2539 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2540 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2541 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2542 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi)
2543 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2544 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi)
2545 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2546 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx)
2547 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2548 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx)
2549 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rcx)
2550 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2551 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx)
2552 ; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%r8)
2553 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%r8)
2554 ; AVX2-FP-NEXT: addq $184, %rsp
2555 ; AVX2-FP-NEXT: vzeroupper
2556 ; AVX2-FP-NEXT: retq
2557 ;
2558 ; AVX2-FCP-LABEL: load_i16_stride4_vf32:
2559 ; AVX2-FCP: # %bb.0:
2560 ; AVX2-FCP-NEXT: subq $104, %rsp
2561 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3
2562 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2563 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6
2564 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2565 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm9
2566 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2567 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
2568 ; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0
2569 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2570 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
2571 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2572 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2573 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
2574 ; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
2575 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2576 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6]
2577 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm5
2578 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
2579 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm8
2580 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm10
2581 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm9
2582 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
2583 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
2584 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2585 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2586 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8
2587 ; AVX2-FCP-NEXT: vpackusdw %xmm8, %xmm1, %xmm1
2588 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2589 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8
2590 ; AVX2-FCP-NEXT: vpackusdw %xmm8, %xmm0, %xmm0
2591 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2592 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm9
2593 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1
2594 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm6
2595 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm8
2596 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
2597 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2598 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2599 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm14
2600 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
2601 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
2602 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm0
2603 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm8
2604 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
2605 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13
2606 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
2607 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
2608 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm4
2609 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm11
2610 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
2611 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
2612 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
2613 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm5
2614 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm10, %ymm10
2615 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
2616 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7]
2617 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
2618 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm11
2619 ; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm15
2620 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm4
2621 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm10
2622 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1]
2623 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm10
2624 ; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
2625 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm0
2626 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
2627 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2628 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
2629 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm1
2630 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4
2631 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
2632 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2633 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2634 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
2635 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm4
2636 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
2637 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
2638 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm0
2639 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm1
2640 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2641 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
2642 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3]
2643 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,2,0,4,5,6,7]
2644 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,2,0,4,5,6,7]
2645 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2646 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
2647 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
2648 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7]
2649 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,0,2,3,4,5,6,7]
2650 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
2651 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
2652 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
2653 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2654 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
2655 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload
2656 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6
2657 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9
2658 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
2659 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[3,1,2,3]
2660 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
2661 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[0,1,2,0,4,5,6,7]
2662 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7]
2663 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
2664 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
2665 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
2666 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[2,0,2,3,4,5,6,7]
2667 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[2,0,2,3,4,5,6,7]
2668 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1]
2669 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
2670 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
2671 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
2672 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4
2673 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm6
2674 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
2675 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
2676 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,3,1,4,5,6,7]
2677 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
2678 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
2679 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7]
2680 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
2681 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2682 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
2683 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm2
2684 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm3
2685 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2686 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7]
2687 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7]
2688 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2689 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7]
2690 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[3,1,2,3,4,5,6,7]
2691 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
2692 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2693 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2694 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2695 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi)
2696 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2697 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi)
2698 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2699 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx)
2700 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
2701 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx)
2702 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rcx)
2703 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2704 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx)
2705 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%r8)
2706 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8)
2707 ; AVX2-FCP-NEXT: addq $104, %rsp
2708 ; AVX2-FCP-NEXT: vzeroupper
2709 ; AVX2-FCP-NEXT: retq
2710 ;
2711 ; AVX512-LABEL: load_i16_stride4_vf32:
2712 ; AVX512: # %bb.0:
2713 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
2714 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1
2715 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2
2716 ; AVX512-NEXT: vpmovqw %ymm2, %xmm2
2717 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2718 ; AVX512-NEXT: vmovdqa 240(%rdi), %xmm6
2719 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
2720 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7]
2721 ; AVX512-NEXT: vmovdqa 224(%rdi), %xmm7
2722 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
2723 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
2724 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2725 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2726 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2727 ; AVX512-NEXT: vpmovqw %zmm1, %xmm3
2728 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2729 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3
2730 ; AVX512-NEXT: vpmovqw %ymm3, %xmm3
2731 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10
2732 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm3
2733 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
2734 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7]
2735 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm4
2736 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
2737 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7]
2738 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
2739 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2740 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
2741 ; AVX512-NEXT: vpmovqw %zmm0, %xmm11
2742 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2743 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3]
2744 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
2745 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
2746 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm13
2747 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm14
2748 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
2749 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
2750 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
2751 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2752 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
2753 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
2754 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3]
2755 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
2756 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
2757 ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2758 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
2759 ; AVX512-NEXT: vpsrlq $16, %zmm1, %zmm9
2760 ; AVX512-NEXT: vpmovqw %zmm9, %xmm9
2761 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
2762 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
2763 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7]
2764 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2765 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
2766 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3]
2767 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
2768 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3]
2769 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
2770 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
2771 ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2772 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
2773 ; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm9
2774 ; AVX512-NEXT: vpmovqw %zmm9, %xmm9
2775 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2776 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[0,1,2,3]
2777 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
2778 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7]
2779 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
2780 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7]
2781 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2782 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm12
2783 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3]
2784 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7]
2785 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3]
2786 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7]
2787 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
2788 ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
2789 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
2790 ; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm13
2791 ; AVX512-NEXT: vpmovqw %zmm13, %xmm13
2792 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2793 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
2794 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7]
2795 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
2796 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,1,2,0,4,5,6,7]
2797 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
2798 ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
2799 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
2800 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7]
2801 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
2802 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7]
2803 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
2804 ; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2805 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
2806 ; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm14
2807 ; AVX512-NEXT: vpmovqw %zmm14, %xmm14
2808 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
2809 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[0,1,2,3]
2810 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
2811 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
2812 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
2813 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
2814 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
2815 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7]
2816 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
2817 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
2818 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
2819 ; AVX512-NEXT: vpsrlq $48, %zmm1, %zmm1
2820 ; AVX512-NEXT: vpmovqw %zmm1, %xmm1
2821 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
2822 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
2823 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
2824 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2825 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2826 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7]
2827 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7]
2828 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
2829 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2830 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
2831 ; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm0
2832 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2833 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2834 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
2835 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi)
2836 ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx)
2837 ; AVX512-NEXT: vmovdqa64 %zmm12, (%rcx)
2838 ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
2839 ; AVX512-NEXT: vzeroupper
2840 ; AVX512-NEXT: retq
2841 ;
2842 ; AVX512-FCP-LABEL: load_i16_stride4_vf32:
2843 ; AVX512-FCP: # %bb.0:
2844 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
2845 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
2846 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
2847 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
2848 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6]
2849 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
2850 ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
2851 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm7
2852 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
2853 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
2854 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9
2855 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10
2856 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14]
2857 ; AVX512-FCP-NEXT: vpermt2d %ymm7, %ymm11, %ymm10
2858 ; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm7
2859 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
2860 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm10
2861 ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm4, %ymm12
2862 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13
2863 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
2864 ; AVX512-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm15
2865 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm4
2866 ; AVX512-FCP-NEXT: vpermt2d %ymm13, %ymm11, %ymm4
2867 ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm13
2868 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
2869 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3]
2870 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
2871 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
2872 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9
2873 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
2874 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm1, %zmm9
2875 ; AVX512-FCP-NEXT: vpmovqw %zmm9, %xmm9
2876 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
2877 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm9
2878 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm12
2879 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
2880 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm12
2881 ; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12
2882 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
2883 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
2884 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7]
2885 ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5
2886 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12
2887 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
2888 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm13
2889 ; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm11, %ymm13
2890 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm1, %zmm12
2891 ; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12
2892 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
2893 ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm10
2894 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
2895 ; AVX512-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9
2896 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
2897 ; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm11, %ymm3
2898 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
2899 ; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm2
2900 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
2901 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3]
2902 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm3
2903 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm5
2904 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
2905 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm1, %zmm1
2906 ; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm1
2907 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
2908 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3
2909 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
2910 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
2911 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm0
2912 ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0
2913 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2914 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
2915 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
2916 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rdx)
2917 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
2918 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
2919 ; AVX512-FCP-NEXT: vzeroupper
2920 ; AVX512-FCP-NEXT: retq
2921 ;
2922 ; AVX512DQ-LABEL: load_i16_stride4_vf32:
2923 ; AVX512DQ: # %bb.0:
2924 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
2925 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1
2926 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2
2927 ; AVX512DQ-NEXT: vpmovqw %ymm2, %xmm2
2928 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2929 ; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm6
2930 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
2931 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7]
2932 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm7
2933 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
2934 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
2935 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2936 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2937 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2938 ; AVX512DQ-NEXT: vpmovqw %zmm1, %xmm3
2939 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2940 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3
2941 ; AVX512DQ-NEXT: vpmovqw %ymm3, %xmm3
2942 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10
2943 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm3
2944 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
2945 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7]
2946 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm4
2947 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
2948 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7]
2949 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
2950 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2951 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
2952 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm11
2953 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2954 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3]
2955 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm10
2956 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm11
2957 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm13
2958 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm14
2959 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
2960 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
2961 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
2962 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2963 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
2964 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
2965 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3]
2966 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
2967 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
2968 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2969 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
2970 ; AVX512DQ-NEXT: vpsrlq $16, %zmm1, %zmm9
2971 ; AVX512DQ-NEXT: vpmovqw %zmm9, %xmm9
2972 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
2973 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
2974 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7]
2975 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2976 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
2977 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3]
2978 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
2979 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3]
2980 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
2981 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
2982 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2983 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
2984 ; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm9
2985 ; AVX512DQ-NEXT: vpmovqw %zmm9, %xmm9
2986 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2987 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[0,1,2,3]
2988 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
2989 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7]
2990 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
2991 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7]
2992 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2993 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm12
2994 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3]
2995 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7]
2996 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3]
2997 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7]
2998 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
2999 ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3000 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
3001 ; AVX512DQ-NEXT: vpsrlq $32, %zmm1, %zmm13
3002 ; AVX512DQ-NEXT: vpmovqw %zmm13, %xmm13
3003 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
3004 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
3005 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7]
3006 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
3007 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,1,2,0,4,5,6,7]
3008 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
3009 ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3010 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
3011 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7]
3012 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
3013 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7]
3014 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
3015 ; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3016 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
3017 ; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm14
3018 ; AVX512DQ-NEXT: vpmovqw %zmm14, %xmm14
3019 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
3020 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[0,1,2,3]
3021 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
3022 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
3023 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
3024 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3025 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
3026 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7]
3027 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
3028 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3029 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
3030 ; AVX512DQ-NEXT: vpsrlq $48, %zmm1, %zmm1
3031 ; AVX512DQ-NEXT: vpmovqw %zmm1, %xmm1
3032 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
3033 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
3034 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
3035 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
3036 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3037 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7]
3038 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7]
3039 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
3040 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3041 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
3042 ; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm0
3043 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
3044 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
3045 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
3046 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi)
3047 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx)
3048 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rcx)
3049 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
3050 ; AVX512DQ-NEXT: vzeroupper
3051 ; AVX512DQ-NEXT: retq
3053 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf32:
3054 ; AVX512DQ-FCP: # %bb.0:
3055 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3056 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
3057 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
3058 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
3059 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6]
3060 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
3061 ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
3062 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm7
3063 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
3064 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
3065 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9
3066 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10
3067 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14]
3068 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm7, %ymm11, %ymm10
3069 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm7
3070 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
3071 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm10
3072 ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm4, %ymm12
3073 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13
3074 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
3075 ; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm15
3076 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm4
3077 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm13, %ymm11, %ymm4
3078 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm13
3079 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
3080 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3]
3081 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
3082 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
3083 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9
3084 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
3085 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm1, %zmm9
3086 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm9, %xmm9
3087 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3088 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm9
3089 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm12
3090 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
3091 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm12
3092 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12
3093 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
3094 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
3095 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7]
3096 ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5
3097 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12
3098 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
3099 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm13
3100 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm11, %ymm13
3101 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm1, %zmm12
3102 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12
3103 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3104 ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm10
3105 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
3106 ; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9
3107 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
3108 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm11, %ymm3
3109 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
3110 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm2
3111 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
3112 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3]
3113 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm3
3114 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm5
3115 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
3116 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm1, %zmm1
3117 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm1
3118 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
3119 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3
3120 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
3121 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
3122 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm0
3123 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0
3124 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
3125 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
3126 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
3127 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rdx)
3128 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
3129 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
3130 ; AVX512DQ-FCP-NEXT: vzeroupper
3131 ; AVX512DQ-FCP-NEXT: retq
3133 ; AVX512BW-LABEL: load_i16_stride4_vf32:
3134 ; AVX512BW: # %bb.0:
3135 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3136 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3137 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3138 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3139 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
3140 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
3141 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5
3142 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm4, %zmm5
3143 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3144 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
3145 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
3146 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
3147 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6
3148 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm5, %zmm6
3149 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3150 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
3151 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
3152 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3153 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7
3154 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm7
3155 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3156 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
3157 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
3158 ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
3159 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm7, %zmm2
3160 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm0
3161 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3162 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi)
3163 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx)
3164 ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rcx)
3165 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8)
3166 ; AVX512BW-NEXT: vzeroupper
3167 ; AVX512BW-NEXT: retq
3169 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf32:
3170 ; AVX512BW-FCP: # %bb.0:
3171 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3172 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3173 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3174 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3175 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
3176 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
3177 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
3178 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm4, %zmm5
3179 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3180 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
3181 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
3182 ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
3183 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
3184 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm6
3185 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3186 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
3187 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
3188 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3189 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
3190 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm7
3191 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3192 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
3193 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
3194 ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
3195 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm7, %zmm2
3196 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm7, %zmm0
3197 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3198 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
3199 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
3200 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx)
3201 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
3202 ; AVX512BW-FCP-NEXT: vzeroupper
3203 ; AVX512BW-FCP-NEXT: retq
3205 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf32:
3206 ; AVX512DQ-BW: # %bb.0:
3207 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
3208 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3209 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3210 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3211 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
3212 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
3213 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5
3214 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm4, %zmm5
3215 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3216 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
3217 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
3218 ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
3219 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6
3220 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm5, %zmm6
3221 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3222 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
3223 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
3224 ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3225 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7
3226 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm7
3227 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3228 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
3229 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
3230 ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
3231 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm7, %zmm2
3232 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm0
3233 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3234 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi)
3235 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx)
3236 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rcx)
3237 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8)
3238 ; AVX512DQ-BW-NEXT: vzeroupper
3239 ; AVX512DQ-BW-NEXT: retq
3241 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf32:
3242 ; AVX512DQ-BW-FCP: # %bb.0:
3243 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3244 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3245 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3246 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3247 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
3248 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
3249 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
3250 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm4, %zmm5
3251 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3252 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
3253 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
3254 ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
3255 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
3256 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm6
3257 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3258 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
3259 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
3260 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3261 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
3262 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm7
3263 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3264 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
3265 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
3266 ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
3267 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm7, %zmm2
3268 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm7, %zmm0
3269 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3270 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
3271 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
3272 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx)
3273 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
3274 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
3275 ; AVX512DQ-BW-FCP-NEXT: retq
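; With AVX512BW, the vf32 deinterleave collapses to cross-register word permutes:
; each of the four results is a vpermt2w/vpermi2w with an index vector that picks
; every fourth word (0,4,8,... for the first stream, 1,5,9,... for the second, and
; so on), after which vshufi64x2 splices the low 256 bits of one permute onto the
; high 256 bits of the other.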
3276 %wide.vec = load <128 x i16>, ptr %in.vec, align 64
3277 %strided.vec0 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124>
3278 %strided.vec1 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>
3279 %strided.vec2 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>
3280 %strided.vec3 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>
3281 store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
3282 store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
3283 store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
3284 store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
3285 ret void
3286 }
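; The vf64 test below scales the same pattern to twice the width: one wide
; <256 x i16> load is split into four stride-4 <64 x i16> results, where lane i
; of output j comes from element 4*i + j of the wide vector.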
3288 define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
3289 ; SSE-LABEL: load_i16_stride4_vf64:
3290 ; SSE: # %bb.0:
3291 ; SSE-NEXT: subq $824, %rsp # imm = 0x338
3292 ; SSE-NEXT: movdqa 352(%rdi), %xmm3
3293 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3294 ; SSE-NEXT: movdqa 320(%rdi), %xmm4
3295 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3296 ; SSE-NEXT: movdqa 336(%rdi), %xmm5
3297 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3298 ; SSE-NEXT: movdqa 96(%rdi), %xmm2
3299 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3300 ; SSE-NEXT: movdqa 112(%rdi), %xmm6
3301 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3302 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
3303 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3304 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
3305 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3306 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3307 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3308 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3309 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3310 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3311 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3312 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3313 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
3314 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3315 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3316 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3317 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3318 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3319 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3320 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3321 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3322 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
3323 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3324 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3325 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
3326 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3327 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3328 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3329 ; SSE-NEXT: movdqa 368(%rdi), %xmm0
3330 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3331 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3332 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3333 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3334 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
3335 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3336 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3337 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3338 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3339 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3340 ; SSE-NEXT: movdqa (%rdi), %xmm1
3341 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3342 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
3343 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3344 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3345 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3346 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3347 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3348 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3349 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3350 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3351 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
3352 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3353 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
3354 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3355 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3356 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3357 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3358 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3359 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3360 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3361 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3362 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3363 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3364 ; SSE-NEXT: movdqa 256(%rdi), %xmm1
3365 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3366 ; SSE-NEXT: movdqa 272(%rdi), %xmm0
3367 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3368 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3369 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3370 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3371 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3372 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3373 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3374 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3375 ; SSE-NEXT: movdqa 288(%rdi), %xmm2
3376 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3377 ; SSE-NEXT: movdqa 304(%rdi), %xmm0
3378 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3379 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3380 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3381 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3382 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3383 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3384 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3385 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3386 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3387 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3388 ; SSE-NEXT: movdqa 192(%rdi), %xmm1
3389 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3390 ; SSE-NEXT: movdqa 208(%rdi), %xmm0
3391 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
3392 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3393 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3394 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3395 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3396 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3397 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3398 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3399 ; SSE-NEXT: movdqa 224(%rdi), %xmm2
3400 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3401 ; SSE-NEXT: movdqa 240(%rdi), %xmm0
3402 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3403 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3404 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3405 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3406 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3]
3407 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,0,2,4,5,6,7]
3408 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3409 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3410 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3411 ; SSE-NEXT: movdqa 448(%rdi), %xmm1
3412 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3413 ; SSE-NEXT: movdqa 464(%rdi), %xmm0
3414 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3415 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,2,2,3]
3416 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7]
3417 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
3418 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7]
3419 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3420 ; SSE-NEXT: movdqa 480(%rdi), %xmm2
3421 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3422 ; SSE-NEXT: movdqa 496(%rdi), %xmm0
3423 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3424 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3]
3425 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7]
3426 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3]
3427 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7]
3428 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3429 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3430 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3431 ; SSE-NEXT: movdqa 128(%rdi), %xmm1
3432 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3433 ; SSE-NEXT: movdqa 144(%rdi), %xmm0
3434 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3435 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
3436 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7]
3437 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3]
3438 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7]
3439 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3440 ; SSE-NEXT: movdqa 160(%rdi), %xmm2
3441 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3442 ; SSE-NEXT: movdqa 176(%rdi), %xmm0
3443 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3444 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
3445 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
3446 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
3447 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,0,2,4,5,6,7]
3448 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3449 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3450 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3451 ; SSE-NEXT: movdqa 384(%rdi), %xmm0
3452 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3453 ; SSE-NEXT: movdqa 400(%rdi), %xmm1
3454 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3455 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
3456 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
3457 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
3458 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
3459 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3460 ; SSE-NEXT: movdqa 416(%rdi), %xmm2
3461 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3462 ; SSE-NEXT: movdqa 432(%rdi), %xmm1
3463 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3464 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
3465 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7]
3466 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3467 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,1,0,2,4,5,6,7]
3468 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
3469 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
3470 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3471 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3472 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
3473 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3474 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
3475 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3476 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3477 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
3478 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3479 ; SSE-NEXT: # xmm15 = mem[0,1,1,3,4,5,6,7]
3480 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3481 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
3482 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3483 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3484 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
3485 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3486 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
3487 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3488 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3489 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
3490 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3491 ; SSE-NEXT: # xmm15 = mem[0,1,1,3,4,5,6,7]
3492 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3493 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
3494 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3495 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3496 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
3497 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3498 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
3499 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3500 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3501 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
3502 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3503 ; SSE-NEXT: # xmm15 = mem[0,1,1,3,4,5,6,7]
3504 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3505 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
3506 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3507 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3508 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
3509 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3510 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
3511 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3512 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3513 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
3514 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3515 ; SSE-NEXT: # xmm15 = mem[0,1,1,3,4,5,6,7]
3516 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3517 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
3518 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3519 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3520 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
3521 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3522 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
3523 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3524 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3525 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
3526 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,1,1,3,4,5,6,7]
3527 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
3528 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1]
3529 ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3530 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,3,2,3,4,5,6,7]
3531 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[1,3,2,3,4,5,6,7]
3532 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3533 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,1,3,4,5,6,7]
3534 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
3535 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
3536 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
3537 ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3538 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7]
3539 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7]
3540 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3541 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7]
3542 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
3543 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
3544 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
3545 ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3546 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
3547 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7]
3548 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3549 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7]
3550 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
3551 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3552 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3553 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3554 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3555 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3556 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3557 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3558 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3559 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3560 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3561 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
3562 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3563 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3564 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3565 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3566 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3567 ; SSE-NEXT: # xmm2 = mem[3,1,2,3]
3568 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3569 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
3570 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
3571 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3572 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3573 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3574 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3575 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3576 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3577 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3578 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3579 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3580 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3581 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
3582 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3583 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3584 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3585 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3586 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3587 ; SSE-NEXT: # xmm2 = mem[3,1,2,3]
3588 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3589 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
3590 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
3591 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3592 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3593 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3594 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3595 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3596 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3597 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3598 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3599 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3600 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3601 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
3602 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3603 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
3604 ; SSE-NEXT: # xmm13 = mem[3,1,2,3]
3605 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3606 ; SSE-NEXT: # xmm8 = mem[3,1,2,3]
3607 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7]
3608 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7]
3609 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3610 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
3611 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3612 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3613 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3614 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3615 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3616 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3617 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3618 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3619 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
3620 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3621 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3622 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3623 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3624 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3625 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3626 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3627 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
3628 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[0,1,2,0,4,5,6,7]
3629 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
3630 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1]
3631 ; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload
3632 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3633 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
3634 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3635 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3636 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3637 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3638 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
3639 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3640 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3641 ; SSE-NEXT: # xmm15 = mem[3,1,2,3]
3642 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
3643 ; SSE-NEXT: # xmm12 = mem[3,1,2,3]
3644 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7]
3645 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[0,1,2,0,4,5,6,7]
3646 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
3647 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1]
3648 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3649 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3650 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3651 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3652 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3653 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3654 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3655 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
3656 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3657 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3658 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3659 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3660 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3661 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3662 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3663 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
3664 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,2,0,4,5,6,7]
3665 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
3666 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1]
3667 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3668 ; SSE-NEXT: # xmm7 = mem[3,1,2,3]
3669 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3670 ; SSE-NEXT: # xmm6 = mem[3,1,2,3]
3671 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
3672 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,0,2,3,4,5,6,7]
3673 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
3674 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3675 ; SSE-NEXT: # xmm4 = mem[3,1,2,3]
3676 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3677 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
3678 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7]
3679 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7]
3680 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
3681 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1]
3682 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3683 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3684 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3685 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3686 ; SSE-NEXT: # xmm2 = mem[3,1,2,3]
3687 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3688 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7]
3689 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7]
3690 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3691 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3692 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3693 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3694 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3695 ; SSE-NEXT: # xmm11 = mem[3,1,2,3]
3696 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
3697 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,1,2,0,4,5,6,7]
3698 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3699 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
3700 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3701 ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
3702 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3703 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3704 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3705 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7]
3706 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,1,4,5,6,7]
3707 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
3708 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
3709 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3710 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3711 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
3712 ; SSE-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7]
3713 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
3714 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3715 ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
3716 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3717 ; SSE-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7]
3718 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
3719 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1]
3720 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7]
3721 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
3722 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
3723 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,3,1,4,5,6,7]
3724 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,1,3,1,4,5,6,7]
3725 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
3726 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm6[0],xmm13[1]
3727 ; SSE-NEXT: pshuflw $231, (%rsp), %xmm1 # 16-byte Folded Reload
3728 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3729 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3730 ; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
3731 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3732 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7]
3733 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7]
3734 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
3735 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1]
3736 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3737 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3738 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3739 ; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
3740 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3741 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3742 ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
3743 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3744 ; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7]
3745 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
3746 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1]
3747 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3748 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3749 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3750 ; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7]
3751 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3752 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3753 ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
3754 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3755 ; SSE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
3756 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3757 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
3758 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3759 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3760 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3761 ; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7]
3762 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
3763 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3764 ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
3765 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7]
3766 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3767 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1]
3768 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3769 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3770 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3771 ; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7]
3772 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
3773 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3774 ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
3775 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3776 ; SSE-NEXT: # xmm7 = mem[0,1,3,1,4,5,6,7]
3777 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
3778 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1]
3779 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3780 ; SSE-NEXT: movaps %xmm1, 96(%rsi)
3781 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3782 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
3783 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3784 ; SSE-NEXT: movaps %xmm1, 112(%rsi)
3785 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3786 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
3787 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3788 ; SSE-NEXT: movaps %xmm1, 64(%rsi)
3789 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3790 ; SSE-NEXT: movaps %xmm1, (%rsi)
3791 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3792 ; SSE-NEXT: movaps %xmm1, 80(%rsi)
3793 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3794 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
3795 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3796 ; SSE-NEXT: movaps %xmm1, 96(%rdx)
3797 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3798 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
3799 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3800 ; SSE-NEXT: movaps %xmm1, 112(%rdx)
3801 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3802 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
3803 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3804 ; SSE-NEXT: movaps %xmm1, 64(%rdx)
3805 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3806 ; SSE-NEXT: movaps %xmm1, (%rdx)
3807 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3808 ; SSE-NEXT: movaps %xmm1, 80(%rdx)
3809 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3810 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
3811 ; SSE-NEXT: movapd %xmm2, 96(%rcx)
3812 ; SSE-NEXT: movapd %xmm5, 32(%rcx)
3813 ; SSE-NEXT: movapd %xmm9, 112(%rcx)
3814 ; SSE-NEXT: movapd %xmm10, 48(%rcx)
3815 ; SSE-NEXT: movapd %xmm14, 64(%rcx)
3816 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3817 ; SSE-NEXT: movaps %xmm1, (%rcx)
3818 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3819 ; SSE-NEXT: movaps %xmm1, 80(%rcx)
3820 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3821 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
3822 ; SSE-NEXT: movapd %xmm7, 112(%r8)
3823 ; SSE-NEXT: movapd %xmm4, 96(%r8)
3824 ; SSE-NEXT: movapd %xmm3, 80(%r8)
3825 ; SSE-NEXT: movapd %xmm15, 64(%r8)
3826 ; SSE-NEXT: movapd %xmm12, 48(%r8)
3827 ; SSE-NEXT: movapd %xmm13, 32(%r8)
3828 ; SSE-NEXT: movapd %xmm8, 16(%r8)
3829 ; SSE-NEXT: movapd %xmm0, (%r8)
3830 ; SSE-NEXT: addq $824, %rsp # imm = 0x338
3833 ; AVX-LABEL: load_i16_stride4_vf64:
3835 ; AVX-NEXT: subq $776, %rsp # imm = 0x308
3836 ; AVX-NEXT: vpxor %xmm10, %xmm10, %xmm10
3837 ; AVX-NEXT: vmovdqa 368(%rdi), %xmm8
3838 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm10[1,2,3],xmm8[4],xmm10[5,6,7]
3839 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3840 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm1
3841 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3842 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3843 ; AVX-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
3844 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm4
3845 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm10[1,2,3],xmm4[4],xmm10[5,6,7]
3846 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3847 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm7
3848 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm10[1,2,3],xmm7[4],xmm10[5,6,7]
3849 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3850 ; AVX-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
3851 ; AVX-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
3852 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3853 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm13
3854 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm10[1,2,3],xmm13[4],xmm10[5,6,7]
3855 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3856 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm1
3857 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3858 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3859 ; AVX-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
3860 ; AVX-NEXT: vmovdqa 272(%rdi), %xmm14
3861 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm10[1,2,3],xmm14[4],xmm10[5,6,7]
3862 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3863 ; AVX-NEXT: vmovdqa 256(%rdi), %xmm2
3864 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3865 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7]
3866 ; AVX-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
3867 ; AVX-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
3868 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3869 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm0
3870 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3871 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3872 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm0
3873 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3874 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3875 ; AVX-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
3876 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm3
3877 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm10[1,2,3],xmm3[4],xmm10[5,6,7]
3878 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm0
3879 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
3880 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3881 ; AVX-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
3882 ; AVX-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
3883 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3884 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm0
3885 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3886 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3887 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3888 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3889 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3890 ; AVX-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
3891 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3892 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3893 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
3894 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3895 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3896 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3897 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3898 ; AVX-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
3899 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3900 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
3901 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3902 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3903 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm15
3904 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm10[1,2,3],xmm15[4],xmm10[5,6,7]
3905 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3906 ; AVX-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
3907 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm0
3908 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3909 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3910 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm0
3911 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3912 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3913 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3914 ; AVX-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
3915 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3916 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm2
3917 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3918 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7]
3919 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm2
3920 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3921 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7]
3922 ; AVX-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
3923 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm9
3924 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3925 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4],xmm10[5,6,7]
3926 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm11
3927 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3928 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1,2,3],xmm11[4],xmm10[5,6,7]
3929 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3930 ; AVX-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
3931 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3932 ; AVX-NEXT: vmovdqa 496(%rdi), %xmm1
3933 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3934 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3935 ; AVX-NEXT: vmovdqa 480(%rdi), %xmm1
3936 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3937 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3938 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3939 ; AVX-NEXT: vmovdqa 464(%rdi), %xmm1
3940 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3941 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3942 ; AVX-NEXT: vmovdqa 448(%rdi), %xmm1
3943 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3944 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3945 ; AVX-NEXT: vpackusdw %xmm11, %xmm12, %xmm11
3946 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3947 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3948 ; AVX-NEXT: vmovdqa 432(%rdi), %xmm2
3949 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3950 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7]
3951 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm1
3952 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3953 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3954 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3955 ; AVX-NEXT: vmovdqa 400(%rdi), %xmm5
3956 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3957 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm10[1,2,3],xmm5[4],xmm10[5,6,7]
3958 ; AVX-NEXT: vmovdqa 384(%rdi), %xmm2
3959 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3960 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7]
3961 ; AVX-NEXT: vpackusdw %xmm11, %xmm10, %xmm10
3962 ; AVX-NEXT: vpackusdw %xmm9, %xmm10, %xmm9
3963 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3964 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3]
3965 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
3966 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
3967 ; AVX-NEXT: # xmm10 = mem[0,2,2,3]
3968 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
3969 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
3970 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3]
3971 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
3972 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3]
3973 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
3974 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
3975 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7]
3976 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3]
3977 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
3978 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3979 ; AVX-NEXT: # xmm11 = mem[0,2,2,3]
3980 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
3981 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
3982 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[0,2,2,3]
3983 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
3984 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
3985 ; AVX-NEXT: # xmm12 = mem[0,2,2,3]
3986 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
3987 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
3988 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
3989 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
3990 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3991 ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3992 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3993 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,2,2,3]
3994 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
3995 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
3996 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3]
3997 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
3998 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
3999 ; AVX-NEXT: vmovdqa %xmm3, %xmm4
4000 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3]
4001 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
4002 ; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
4003 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3]
4004 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4005 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4006 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7]
4007 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4008 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3]
4009 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4010 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4011 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,2,2,3]
4012 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
4013 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4014 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4015 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,2,2,3]
4016 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4017 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4018 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,2,2,3]
4019 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
4020 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
4021 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
4022 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
4023 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4024 ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4025 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4026 ; AVX-NEXT: # xmm9 = mem[0,2,2,3]
4027 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
4028 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,2,2,3]
4029 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4030 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
4031 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4032 ; AVX-NEXT: # xmm10 = mem[0,2,2,3]
4033 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
4034 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,2,2,3]
4035 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4036 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4037 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7]
4038 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4039 ; AVX-NEXT: # xmm10 = mem[0,2,2,3]
4040 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4041 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4042 ; AVX-NEXT: # xmm11 = mem[0,2,2,3]
4043 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
4044 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4045 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4046 ; AVX-NEXT: # xmm11 = mem[0,2,2,3]
4047 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4048 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4049 ; AVX-NEXT: # xmm12 = mem[0,2,2,3]
4050 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
4051 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
4052 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
4053 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
4054 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4055 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4056 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4057 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,2,2,3]
4058 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
4059 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4060 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[0,2,2,3]
4061 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4062 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
4063 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4064 ; AVX-NEXT: # xmm10 = mem[0,2,2,3]
4065 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
4066 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4067 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,2,2,3]
4068 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4069 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4070 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7]
4071 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4072 ; AVX-NEXT: # xmm10 = mem[0,2,2,3]
4073 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4074 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4075 ; AVX-NEXT: # xmm11 = mem[0,2,2,3]
4076 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
4077 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4078 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4079 ; AVX-NEXT: # xmm11 = mem[0,2,2,3]
4080 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4081 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4082 ; AVX-NEXT: # xmm12 = mem[0,2,2,3]
4083 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
4084 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
4085 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
4086 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
4087 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4088 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4089 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
4090 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4091 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
4092 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4093 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4094 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
4095 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4096 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3]
4097 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4098 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
4099 ; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
4100 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
4101 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
4102 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4103 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4104 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
4105 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4106 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
4107 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4108 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
4109 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4110 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4111 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3]
4112 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4113 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,1,2,3]
4114 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4115 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4116 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
4117 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4118 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4119 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4120 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4121 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4122 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4123 ; AVX-NEXT: # xmm0 = mem[3,1,2,3]
4124 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4125 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4126 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
4127 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4128 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4129 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
4130 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4131 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4132 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
4133 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4134 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4135 ; AVX-NEXT: # xmm2 = mem[3,1,2,3]
4136 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4137 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
4138 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4139 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4140 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4141 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4142 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
4143 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4144 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4145 ; AVX-NEXT: # xmm2 = mem[3,1,2,3]
4146 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4147 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
4148 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4149 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4150 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4151 ; AVX-NEXT: # xmm12 = mem[3,1,2,3]
4152 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4153 ; AVX-NEXT: # xmm11 = mem[3,1,2,3]
4154 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7]
4155 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7]
4156 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4157 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4158 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4159 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4160 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4161 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3]
4162 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4163 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
4164 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4165 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4166 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7]
4167 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4168 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4169 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
4170 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4171 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
4172 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4173 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
4174 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4175 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4176 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4177 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4178 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
4179 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4180 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4181 ; AVX-NEXT: # xmm2 = mem[3,1,2,3]
4182 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4183 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
4184 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4185 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4186 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
4187 ; AVX-NEXT: # xmm13 = mem[3,1,2,3]
4188 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4189 ; AVX-NEXT: # xmm14 = mem[3,1,2,3]
4190 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7]
4191 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[2,0,2,3,4,5,6,7]
4192 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4193 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4194 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4195 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4196 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4197 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4198 ; AVX-NEXT: # xmm10 = mem[3,1,2,3]
4199 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4200 ; AVX-NEXT: # xmm9 = mem[3,1,2,3]
4201 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7]
4202 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7]
4203 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4204 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4205 ; AVX-NEXT: # xmm8 = mem[3,1,2,3]
4206 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4207 ; AVX-NEXT: # xmm7 = mem[3,1,2,3]
4208 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7]
4209 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7]
4210 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4211 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4212 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4213 ; AVX-NEXT: # xmm6 = mem[3,1,2,3]
4214 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4215 ; AVX-NEXT: # xmm5 = mem[3,1,2,3]
4216 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7]
4217 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7]
4218 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4219 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4220 ; AVX-NEXT: # xmm4 = mem[3,1,2,3]
4221 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4222 ; AVX-NEXT: # xmm2 = mem[3,1,2,3]
4223 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7]
4224 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7]
4225 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
4226 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4227 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
4228 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4229 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4230 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4231 ; AVX-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
4232 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4233 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4234 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4235 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4236 ; AVX-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
4237 ; AVX-NEXT: vpshuflw $231, (%rsp), %xmm3 # 16-byte Folded Reload
4238 ; AVX-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
4239 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4240 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4241 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4242 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4243 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4244 ; AVX-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
4245 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4246 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4247 ; AVX-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
4248 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4249 ; AVX-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7]
4250 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1]
4251 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
4252 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4253 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4254 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4255 ; AVX-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
4256 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4257 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4258 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4259 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4260 ; AVX-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
4261 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4262 ; AVX-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
4263 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4264 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4265 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4266 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4267 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4268 ; AVX-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
4269 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4270 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
4271 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
4272 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
4273 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
4274 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4275 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4276 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,3,1,4,5,6,7]
4277 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7]
4278 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4279 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7]
4280 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
4281 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
4282 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4283 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7]
4284 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
4285 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
4286 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
4287 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
4288 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
4289 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4290 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4291 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4292 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4293 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4294 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4295 ; AVX-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
4296 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4297 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4298 ; AVX-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
4299 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4300 ; AVX-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7]
4301 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
4302 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4303 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4304 ; AVX-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
4305 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4306 ; AVX-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7]
4307 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
4308 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7]
4309 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[3,1,2,3,4,5,6,7]
4310 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
4311 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
4312 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4313 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4314 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4315 ; AVX-NEXT: vmovaps %xmm2, 96(%rsi)
4316 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4317 ; AVX-NEXT: vmovaps %xmm2, 112(%rsi)
4318 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4319 ; AVX-NEXT: vmovaps %xmm2, 32(%rsi)
4320 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4321 ; AVX-NEXT: vmovaps %xmm2, 48(%rsi)
4322 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4323 ; AVX-NEXT: vmovaps %xmm2, (%rsi)
4324 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4325 ; AVX-NEXT: vmovaps %xmm2, 16(%rsi)
4326 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4327 ; AVX-NEXT: vmovaps %xmm2, 64(%rsi)
4328 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4329 ; AVX-NEXT: vmovaps %xmm2, 80(%rsi)
4330 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4331 ; AVX-NEXT: vmovaps %ymm2, 96(%rdx)
4332 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4333 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
4334 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4335 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
4336 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4337 ; AVX-NEXT: vmovaps %ymm2, 64(%rdx)
4338 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4339 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
4340 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4341 ; AVX-NEXT: vmovaps %ymm2, 96(%rcx)
4342 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4343 ; AVX-NEXT: vmovaps %ymm2, 64(%rcx)
4344 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4345 ; AVX-NEXT: vmovaps %ymm2, (%rcx)
4346 ; AVX-NEXT: vmovaps %ymm1, 96(%r8)
4347 ; AVX-NEXT: vmovaps %ymm0, 32(%r8)
4348 ; AVX-NEXT: vmovaps %ymm3, 64(%r8)
4349 ; AVX-NEXT: vmovaps %ymm15, (%r8)
4350 ; AVX-NEXT: addq $776, %rsp # imm = 0x308
4351 ; AVX-NEXT: vzeroupper
4354 ; AVX2-LABEL: load_i16_stride4_vf64:
4356 ; AVX2-NEXT: subq $696, %rsp # imm = 0x2B8
4357 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
4358 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4359 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
4360 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4361 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4362 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4363 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4364 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4365 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4366 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4367 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4368 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4369 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4370 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4371 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
4372 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4373 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4374 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4375 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4376 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4377 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4378 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4379 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
4380 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4381 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4382 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4383 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4384 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4385 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4386 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4387 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4388 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4389 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4390 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4391 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
4392 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4393 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4394 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4395 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4396 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4397 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4398 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4399 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
4400 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4401 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4402 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4403 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4404 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4405 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4406 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4407 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4408 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4409 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4410 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4411 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
4412 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4413 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4414 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4415 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4416 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4417 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4418 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4419 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
4420 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4421 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4422 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4423 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4424 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4425 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4426 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4427 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4428 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4429 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4430 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4431 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
4432 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4433 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
4434 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4435 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
4436 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4437 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4438 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm4
4439 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4440 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm3
4441 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4442 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm5
4443 ; AVX2-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
4444 ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm6
4445 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4446 ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm0
4447 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4448 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4449 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
4450 ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm1
4451 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4452 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4453 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4454 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4455 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4456 ; AVX2-NEXT: vmovdqa 336(%rdi), %xmm1
4457 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4458 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4459 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
4460 ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm2
4461 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4462 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4463 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4464 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4465 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4466 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4467 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
4468 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4469 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
4470 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
4471 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4472 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
4473 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4474 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
4475 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
4476 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4477 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
4478 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4479 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4480 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm0
4481 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4482 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4483 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
4484 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm1
4485 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4486 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4487 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4488 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4489 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm1
4490 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4491 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4492 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
4493 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
4494 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4495 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4496 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4497 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4498 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4499 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4500 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4501 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm14
4502 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3]
4503 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4504 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm8
4505 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
4506 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
4507 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4508 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm12
4509 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,2,2,3]
4510 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4511 ; AVX2-NEXT: vmovdqa (%rdi), %xmm9
4512 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3]
4513 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
4514 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
4515 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
4516 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4517 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4518 ; AVX2-NEXT: vmovdqa 240(%rdi), %xmm0
4519 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4520 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4521 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
4522 ; AVX2-NEXT: vmovdqa 224(%rdi), %xmm1
4523 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4524 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4525 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4526 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4527 ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm1
4528 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4529 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4530 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
4531 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm2
4532 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4533 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4534 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4535 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4536 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4537 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4538 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4539 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm1
4540 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4541 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4542 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4543 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm2
4544 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4545 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4546 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
4547 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4548 ; AVX2-NEXT: vmovdqa 144(%rdi), %xmm2
4549 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4550 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4551 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4552 ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm3
4553 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4554 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3]
4555 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
4556 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
4557 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
4558 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4559 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4560 ; AVX2-NEXT: vmovdqa 496(%rdi), %xmm11
4561 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,2,2,3]
4562 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
4563 ; AVX2-NEXT: vmovdqa 480(%rdi), %xmm13
4564 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3]
4565 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4566 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4567 ; AVX2-NEXT: vmovdqa 464(%rdi), %xmm15
4568 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3]
4569 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
4570 ; AVX2-NEXT: vmovdqa 448(%rdi), %xmm6
4571 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
4572 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
4573 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
4574 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4575 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4576 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4577 ; AVX2-NEXT: vmovdqa 432(%rdi), %xmm5
4578 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
4579 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4580 ; AVX2-NEXT: vmovdqa 416(%rdi), %xmm4
4581 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3]
4582 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
4583 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
4584 ; AVX2-NEXT: vmovdqa 400(%rdi), %xmm7
4585 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
4586 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[1,3,2,3,4,5,6,7]
4587 ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm1
4588 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
4589 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
4590 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
4591 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
4592 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4593 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4594 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4595 ; AVX2-NEXT: # xmm0 = mem[3,1,2,3]
4596 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4597 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4598 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4599 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4600 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4601 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4602 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4603 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4604 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4605 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4606 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4607 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3]
4608 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4609 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4610 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
4611 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4612 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4613 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4614 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4615 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
4616 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4617 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3]
4618 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4619 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4620 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7]
4621 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4622 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3]
4623 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4624 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,1,2,3]
4625 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4626 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
4627 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,0,2,3,4,5,6,7]
4628 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
4629 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
4630 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4631 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4632 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4633 ; AVX2-NEXT: # xmm0 = mem[3,1,2,3]
4634 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4635 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4636 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4637 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4638 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4639 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4640 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4641 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4642 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4643 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4644 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4645 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3]
4646 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4647 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4648 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
4649 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4650 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4651 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4652 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4653 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4654 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4655 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4656 ; AVX2-NEXT: vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload
4657 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3]
4658 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4659 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4660 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7]
4661 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4662 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4663 ; AVX2-NEXT: # xmm12 = mem[3,1,2,3]
4664 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4665 ; AVX2-NEXT: # xmm10 = mem[3,1,2,3]
4666 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,0,2,3,4,5,6,7]
4667 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[2,0,2,3,4,5,6,7]
4668 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
4669 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
4670 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4671 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4672 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
4673 ; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
4674 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3]
4675 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4676 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4677 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4678 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4679 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
4680 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4681 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3]
4682 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4683 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4684 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
4685 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4686 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4687 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4688 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4689 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
4690 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4691 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,1,2,3]
4692 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4693 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4694 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7]
4695 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4696 ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3]
4697 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[3,1,2,3]
4698 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7]
4699 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7]
4700 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4701 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
4702 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4703 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4704 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4705 ; AVX2-NEXT: # xmm9 = mem[3,1,2,3]
4706 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4707 ; AVX2-NEXT: # xmm8 = mem[3,1,2,3]
4708 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7]
4709 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7]
4710 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4711 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4712 ; AVX2-NEXT: # xmm7 = mem[3,1,2,3]
4713 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4714 ; AVX2-NEXT: # xmm6 = mem[3,1,2,3]
4715 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7]
4716 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7]
4717 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4718 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4719 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4720 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4721 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4722 ; AVX2-NEXT: # xmm5 = mem[3,1,2,3]
4723 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4724 ; AVX2-NEXT: # xmm4 = mem[3,1,2,3]
4725 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7]
4726 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7]
4727 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4728 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4729 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3]
4730 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4731 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4732 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7]
4733 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7]
4734 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
4735 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
4736 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
4737 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4738 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4739 ; AVX2-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
4740 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4741 ; AVX2-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4742 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4743 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4744 ; AVX2-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
4745 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4746 ; AVX2-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7]
4747 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
4748 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4749 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4750 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4751 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4752 ; AVX2-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4753 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4754 ; AVX2-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7]
4755 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
4756 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4757 ; AVX2-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7]
4758 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4759 ; AVX2-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7]
4760 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
4761 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
4762 ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4763 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4764 ; AVX2-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
4765 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4766 ; AVX2-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4767 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4768 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4769 ; AVX2-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
4770 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4771 ; AVX2-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7]
4772 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
4773 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4774 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4775 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4776 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4777 ; AVX2-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4778 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4779 ; AVX2-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7]
4780 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
4781 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
4782 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
4783 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1]
4784 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
4785 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4786 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7]
4787 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7]
4788 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
4789 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
4790 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
4791 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
4792 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4793 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
4794 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
4795 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
4796 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
4797 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
4798 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
4799 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
4800 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4801 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
4802 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4803 ; AVX2-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload
4804 ; AVX2-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
4805 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4806 ; AVX2-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
4807 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4808 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4809 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
4810 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4811 ; AVX2-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7]
4812 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4813 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4814 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4815 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4816 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4817 ; AVX2-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
4818 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4819 ; AVX2-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7]
4820 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4821 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7]
4822 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7]
4823 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
4824 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
4825 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
4826 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4827 ; AVX2-NEXT: vmovaps %ymm3, 32(%rsi)
4828 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4829 ; AVX2-NEXT: vmovaps %ymm3, 96(%rsi)
4830 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4831 ; AVX2-NEXT: vmovaps %ymm3, 64(%rsi)
4832 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4833 ; AVX2-NEXT: vmovaps %ymm3, (%rsi)
4834 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4835 ; AVX2-NEXT: vmovaps %ymm3, 96(%rdx)
4836 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4837 ; AVX2-NEXT: vmovaps %ymm3, 32(%rdx)
4838 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4839 ; AVX2-NEXT: vmovaps %ymm3, (%rdx)
4840 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4841 ; AVX2-NEXT: vmovaps %ymm3, 64(%rdx)
4842 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4843 ; AVX2-NEXT: vmovaps %ymm3, 32(%rcx)
4844 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4845 ; AVX2-NEXT: vmovaps %ymm3, 96(%rcx)
4846 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4847 ; AVX2-NEXT: vmovaps %ymm3, 64(%rcx)
4848 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4849 ; AVX2-NEXT: vmovaps %ymm3, (%rcx)
4850 ; AVX2-NEXT: vmovdqa %ymm2, 96(%r8)
4851 ; AVX2-NEXT: vmovdqa %ymm0, 32(%r8)
4852 ; AVX2-NEXT: vmovdqa %ymm1, 64(%r8)
4853 ; AVX2-NEXT: vmovdqa %ymm15, (%r8)
4854 ; AVX2-NEXT: addq $696, %rsp # imm = 0x2B8
4855 ; AVX2-NEXT: vzeroupper
4856 ; AVX2-NEXT: retq
4857 ;
4858 ; AVX2-FP-LABEL: load_i16_stride4_vf64:
4859 ; AVX2-FP: # %bb.0:
4860 ; AVX2-FP-NEXT: subq $712, %rsp # imm = 0x2C8
4861 ; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0
4862 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4863 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
4864 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4865 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4866 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4867 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4868 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4869 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4870 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4871 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4872 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4873 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4874 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4875 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
4876 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4877 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4878 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4879 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4880 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4881 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4882 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4883 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
4884 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4885 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4886 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4887 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4888 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4889 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4890 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4891 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4892 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4893 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4894 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4895 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
4896 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4897 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4898 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4899 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4900 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4901 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4902 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4903 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
4904 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4905 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4906 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4907 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4908 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4909 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4910 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4911 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4912 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4913 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4914 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4915 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
4916 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4917 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4918 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4919 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4920 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4921 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4922 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4923 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
4924 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4925 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4926 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4927 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4928 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4929 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4930 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4931 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4932 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4933 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4934 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4935 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
4936 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4937 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
4938 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4939 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
4940 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4941 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4942 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm5
4943 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4944 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm3
4945 ; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
4946 ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm7
4947 ; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4948 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
4949 ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm0
4950 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4951 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
4952 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm1
4953 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4954 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
4955 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4956 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4957 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
4958 ; AVX2-FP-NEXT: vmovdqa 336(%rdi), %xmm1
4959 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4960 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
4961 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm2
4962 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4963 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
4964 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4965 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4966 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4967 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm1
4968 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm2
4969 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4970 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm2
4971 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4972 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
4973 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3
4974 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4975 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
4976 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4977 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4978 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm5
4979 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm2
4980 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm11
4981 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm11, %xmm3
4982 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4983 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm12
4984 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm12, %xmm7
4985 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm15
4986 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm8
4987 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
4988 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4989 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
4990 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7]
4991 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm0
4992 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4993 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3
4994 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm8
4995 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm9
4996 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
4997 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
4998 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4999 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
5000 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5001 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm9
5002 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm10
5003 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
5004 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
5005 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
5006 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5007 ; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm0
5008 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5009 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm7
5010 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm0
5011 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5012 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm8
5013 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
5014 ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm0
5015 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5016 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm8
5017 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm0
5018 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5019 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm9
5020 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
5021 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
5022 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5023 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
5024 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm0
5025 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5026 ; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm1
5027 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5028 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm8
5029 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm9
5030 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
5031 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm0
5032 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5033 ; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm1
5034 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5035 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm9
5036 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm10
5037 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
5038 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
5039 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
5040 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5041 ; AVX2-FP-NEXT: vmovdqa 496(%rdi), %xmm8
5042 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm9
5043 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm7
5044 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm10
5045 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
5046 ; AVX2-FP-NEXT: vmovdqa 464(%rdi), %xmm10
5047 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm14
5048 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm0
5049 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5050 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm9
5051 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
5052 ; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
5053 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
5054 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7]
5055 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm14
5056 ; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm13
5057 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm13, %xmm0
5058 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm14, %xmm4
5059 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5060 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm0
5061 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5062 ; AVX2-FP-NEXT: vmovdqa 400(%rdi), %xmm4
5063 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm1
5064 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6
5065 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
5066 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
5067 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
5068 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5069 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
5070 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5071 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3]
5072 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5073 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
5074 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5075 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5076 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3]
5077 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5078 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
5079 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5080 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
5081 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5082 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5083 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5084 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5085 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5086 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
5087 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5088 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5089 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3]
5090 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5091 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5092 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
5093 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5094 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5095 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3]
5096 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5097 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5098 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3]
5099 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5100 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5101 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
5102 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5103 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5104 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5105 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5106 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5107 ; AVX2-FP-NEXT: # xmm0 = mem[3,1,2,3]
5108 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5109 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5110 ; AVX2-FP-NEXT: # xmm1 = mem[3,1,2,3]
5111 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5112 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
5113 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5114 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5115 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5116 ; AVX2-FP-NEXT: # xmm1 = mem[3,1,2,3]
5117 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5118 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5119 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3]
5120 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5121 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
5122 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5123 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5124 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5125 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5126 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5127 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5128 ; AVX2-FP-NEXT: # xmm1 = mem[3,1,2,3]
5129 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5130 ; AVX2-FP-NEXT: vpshufd $231, (%rsp), %xmm15 # 16-byte Folded Reload
5131 ; AVX2-FP-NEXT: # xmm15 = mem[3,1,2,3]
5132 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5133 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7]
5134 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5135 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
5136 ; AVX2-FP-NEXT: # xmm12 = mem[3,1,2,3]
5137 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
5138 ; AVX2-FP-NEXT: # xmm11 = mem[3,1,2,3]
5139 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7]
5140 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7]
5141 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5142 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5143 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5144 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5145 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3]
5146 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
5147 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,1,2,3]
5148 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5149 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
5150 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5151 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5152 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
5153 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5154 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5155 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3]
5156 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5157 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
5158 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5159 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5160 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5161 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5162 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5163 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
5164 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5165 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
5166 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5167 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5168 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
5169 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5170 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3]
5171 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5172 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
5173 ; AVX2-FP-NEXT: # xmm10 = mem[3,1,2,3]
5174 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5175 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,0,2,3,4,5,6,7]
5176 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5177 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5178 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5179 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5180 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
5181 ; AVX2-FP-NEXT: # xmm14 = mem[3,1,2,3]
5182 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
5183 ; AVX2-FP-NEXT: # xmm9 = mem[3,1,2,3]
5184 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7]
5185 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7]
5186 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5187 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
5188 ; AVX2-FP-NEXT: # xmm8 = mem[3,1,2,3]
5189 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5190 ; AVX2-FP-NEXT: # xmm7 = mem[3,1,2,3]
5191 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7]
5192 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7]
5193 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5194 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5195 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5196 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5197 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
5198 ; AVX2-FP-NEXT: # xmm6 = mem[3,1,2,3]
5199 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5200 ; AVX2-FP-NEXT: # xmm5 = mem[3,1,2,3]
5201 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7]
5202 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7]
5203 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5204 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5205 ; AVX2-FP-NEXT: # xmm4 = mem[3,1,2,3]
5206 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5207 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3]
5208 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7]
5209 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7]
5210 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
5211 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5212 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
5213 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5214 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5215 ; AVX2-FP-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
5216 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5217 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5218 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5219 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5220 ; AVX2-FP-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
5221 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5222 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5223 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5224 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5225 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5226 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5227 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5228 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5229 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5230 ; AVX2-FP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
5231 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5232 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5233 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5234 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5235 ; AVX2-FP-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7]
5236 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1]
5237 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5238 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5239 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5240 ; AVX2-FP-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
5241 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5242 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5243 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5244 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5245 ; AVX2-FP-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
5246 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5247 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5248 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5249 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5250 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5251 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5252 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5253 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5254 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7]
5255 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5256 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
5257 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
5258 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
5259 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5260 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5261 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7]
5262 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7]
5263 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5264 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7]
5265 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7]
5266 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5267 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5268 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5269 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5270 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7]
5271 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,1,4,5,6,7]
5272 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5273 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7]
5274 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
5275 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
5276 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5277 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5278 ; AVX2-FP-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload
5279 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5280 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5281 ; AVX2-FP-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
5282 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5283 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5284 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
5285 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5286 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5287 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5288 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5289 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5290 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5291 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5292 ; AVX2-FP-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
5293 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5294 ; AVX2-FP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
5295 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5296 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5297 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5298 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7]
5299 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5300 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
5301 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5302 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5303 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi)
5304 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5305 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rsi)
5306 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5307 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rsi)
5308 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5309 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi)
5310 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5311 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rdx)
5312 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5313 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
5314 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5315 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx)
5316 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5317 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rdx)
5318 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5319 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rcx)
5320 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5321 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rcx)
5322 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5323 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rcx)
5324 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5325 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx)
5326 ; AVX2-FP-NEXT: vmovdqa %ymm1, 96(%r8)
5327 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%r8)
5328 ; AVX2-FP-NEXT: vmovdqa %ymm11, 64(%r8)
5329 ; AVX2-FP-NEXT: vmovdqa %ymm13, (%r8)
5330 ; AVX2-FP-NEXT: addq $712, %rsp # imm = 0x2C8
5331 ; AVX2-FP-NEXT: vzeroupper
5332 ; AVX2-FP-NEXT: retq
5333 ;
5334 ; AVX2-FCP-LABEL: load_i16_stride4_vf64:
5335 ; AVX2-FCP: # %bb.0:
5336 ; AVX2-FCP-NEXT: subq $680, %rsp # imm = 0x2A8
5337 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7
5338 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5339 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6
5340 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5341 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
5342 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5343 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
5344 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5345 ; AVX2-FCP-NEXT: vpxor %xmm1, %xmm1, %xmm1
5346 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5347 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
5348 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
5349 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5350 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
5351 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
5352 ; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm2, %xmm2
5353 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
5354 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm3
5355 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5356 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
5357 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
5358 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5
5359 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5360 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5
5361 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
5362 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5363 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5364 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5365 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
5366 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
5367 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5368 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
5369 ; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
5370 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm3, %xmm3
5371 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm2
5372 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
5373 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm6
5374 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm7
5375 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
5376 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
5377 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5378 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5379 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
5380 ; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
5381 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5382 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
5383 ; AVX2-FCP-NEXT: vpackusdw %xmm7, %xmm5, %xmm5
5384 ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm7
5385 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5386 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm5, %xmm3
5387 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm5
5388 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5389 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5
5390 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5391 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5
5392 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm14
5393 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm7
5394 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
5395 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
5396 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5397 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5398 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
5399 ; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
5400 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5401 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
5402 ; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm1, %xmm1
5403 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
5404 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
5405 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5406 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm3
5407 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5408 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm9
5409 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm3
5410 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0
5411 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5
5412 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
5413 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5414 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5415 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm1
5416 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5417 ; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm12
5418 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
5419 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm0
5420 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
5421 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5422 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm10
5423 ; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5424 ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm7
5425 ; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5426 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
5427 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
5428 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm8
5429 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
5430 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
5431 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
5432 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
5433 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
5434 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
5435 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5436 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5437 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
5438 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm1
5439 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
5440 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7
5441 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
5442 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7
5443 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm15
5444 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm11
5445 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm13
5446 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
5447 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
5448 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5449 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
5450 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5451 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm13
5452 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7]
5453 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
5454 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5455 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm8
5456 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5457 ; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm6
5458 ; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill
5459 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6
5460 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm11
5461 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1]
5462 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm8
5463 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5464 ; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm11
5465 ; AVX2-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5466 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm11
5467 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm13
5468 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
5469 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
5470 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
5471 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
5472 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
5473 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
5474 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5475 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm6
5476 ; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm13
5477 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm3
5478 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5
5479 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
5480 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm5
5481 ; AVX2-FCP-NEXT: vmovdqa 400(%rdi), %xmm3
5482 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm11
5483 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0
5484 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
5485 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
5486 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5487 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm9
5488 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm11
5489 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
5490 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
5491 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5492 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
5493 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
5494 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5495 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
5496 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5497 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm9
5498 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11
5499 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
5500 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
5501 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5502 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
5503 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5504 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5505 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
5506 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5507 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
5508 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5509 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
5510 ; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5511 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5512 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,0,2,3,4,5,6,7]
5513 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
5514 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5515 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
5516 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5517 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
5518 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5519 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
5520 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5521 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
5522 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5523 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5524 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
5525 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5526 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
5527 ; AVX2-FCP-NEXT: # xmm14 = mem[3,1,2,3]
5528 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
5529 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,2,0,4,5,6,7]
5530 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
5531 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
5532 ; AVX2-FCP-NEXT: # xmm11 = mem[3,1,2,3]
5533 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
5534 ; AVX2-FCP-NEXT: # xmm9 = mem[3,1,2,3]
5535 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7]
5536 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7]
5537 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
5538 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
5539 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5540 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5541 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
5542 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5543 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
5544 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5545 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
5546 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5547 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5548 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3]
5549 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5550 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
5551 ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5552 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
5553 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,0,4,5,6,7]
5554 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
5555 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[3,1,2,3]
5556 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,1,2,3]
5557 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7]
5558 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,0,2,3,4,5,6,7]
5559 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
5560 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
5561 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5562 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5563 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
5564 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
5565 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm0
5566 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm1
5567 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5568 ; AVX2-FCP-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload
5569 ; AVX2-FCP-NEXT: # xmm6 = mem[3,1,2,3]
5570 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5571 ; AVX2-FCP-NEXT: # xmm5 = mem[3,1,2,3]
5572 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7]
5573 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7]
5574 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5575 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5576 ; AVX2-FCP-NEXT: # xmm4 = mem[3,1,2,3]
5577 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5578 ; AVX2-FCP-NEXT: # xmm2 = mem[3,1,2,3]
5579 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7]
5580 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7]
5581 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
5582 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5583 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
5584 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5585 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5586 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
5587 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5588 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
5589 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5590 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5591 ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5592 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5593 ; AVX2-FCP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
5594 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5595 ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5596 ; AVX2-FCP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5597 ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5598 ; AVX2-FCP-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7]
5599 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1]
5600 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5601 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5602 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5603 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
5604 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5605 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
5606 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5607 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5608 ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5609 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7]
5610 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5611 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7]
5612 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
5613 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
5614 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5615 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5616 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm1
5617 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm3
5618 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
5619 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7]
5620 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
5621 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
5622 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
5623 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
5624 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
5625 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
5626 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5627 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5628 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
5629 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5630 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
5631 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5632 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5633 ; AVX2-FCP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
5634 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5635 ; AVX2-FCP-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7]
5636 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5637 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7]
5638 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7]
5639 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5640 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
5641 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5642 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5643 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi)
5644 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5645 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rsi)
5646 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5647 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rsi)
5648 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5649 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi)
5650 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5651 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rdx)
5652 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5653 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx)
5654 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5655 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx)
5656 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5657 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rdx)
5658 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5659 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx)
5660 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5661 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rcx)
5662 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5663 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rcx)
5664 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5665 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx)
5666 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 96(%r8)
5667 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r8)
5668 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r8)
5669 ; AVX2-FCP-NEXT: vmovdqa %ymm15, (%r8)
5670 ; AVX2-FCP-NEXT: addq $680, %rsp # imm = 0x2A8
5671 ; AVX2-FCP-NEXT: vzeroupper
5672 ; AVX2-FCP-NEXT: retq
5674 ; AVX512-LABEL: load_i16_stride4_vf64:
5675 ; AVX512: # %bb.0:
5676 ; AVX512-NEXT: subq $200, %rsp
5677 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm26
5678 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm27
5679 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm28
5680 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm29
5681 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm0
5682 ; AVX512-NEXT: vpmovqw %ymm0, %xmm0
5683 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5684 ; AVX512-NEXT: vmovdqa 240(%rdi), %xmm14
5685 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
5686 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7]
5687 ; AVX512-NEXT: vmovdqa 224(%rdi), %xmm13
5688 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
5689 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
5690 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5691 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5692 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
5693 ; AVX512-NEXT: vpmovqw %zmm29, %xmm1
5694 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5695 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
5696 ; AVX512-NEXT: vpmovqw %ymm1, %xmm1
5697 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5698 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm12
5699 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
5700 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
5701 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm11
5702 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
5703 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
5704 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5705 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5706 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
5707 ; AVX512-NEXT: vpmovqw %zmm28, %xmm4
5708 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
5709 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
5710 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5711 ; AVX512-NEXT: vmovdqa 448(%rdi), %ymm0
5712 ; AVX512-NEXT: vpmovqw %ymm0, %xmm0
5713 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5714 ; AVX512-NEXT: vmovdqa64 496(%rdi), %xmm24
5715 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[0,2,2,3]
5716 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7]
5717 ; AVX512-NEXT: vmovdqa64 480(%rdi), %xmm23
5718 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm23[0,2,2,3]
5719 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7]
5720 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
5721 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5722 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
5723 ; AVX512-NEXT: vpmovqw %zmm27, %xmm1
5724 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5725 ; AVX512-NEXT: vmovdqa 320(%rdi), %ymm1
5726 ; AVX512-NEXT: vpmovqw %ymm1, %xmm1
5727 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5728 ; AVX512-NEXT: vmovdqa64 368(%rdi), %xmm31
5729 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm31[0,2,2,3]
5730 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7]
5731 ; AVX512-NEXT: vmovdqa64 352(%rdi), %xmm25
5732 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm25[0,2,2,3]
5733 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7]
5734 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
5735 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5736 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5737 ; AVX512-NEXT: vpmovqw %zmm26, %xmm1
5738 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5739 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3]
5740 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5741 ; AVX512-NEXT: vmovdqa64 320(%rdi), %xmm30
5742 ; AVX512-NEXT: vmovdqa64 336(%rdi), %xmm17
5743 ; AVX512-NEXT: vmovdqa64 448(%rdi), %xmm18
5744 ; AVX512-NEXT: vmovdqa64 464(%rdi), %xmm19
5745 ; AVX512-NEXT: vmovdqa64 64(%rdi), %xmm20
5746 ; AVX512-NEXT: vmovdqa64 80(%rdi), %xmm21
5747 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm0
5748 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm1
5749 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7]
5750 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
5751 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
5752 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5753 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
5754 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
5755 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
5756 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
5757 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
5758 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5759 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
5760 ; AVX512-NEXT: vpsrlq $16, %zmm29, %zmm6
5761 ; AVX512-NEXT: vpmovqw %zmm6, %xmm6
5762 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
5763 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
5764 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
5765 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
5766 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5767 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm21[0,2,2,3]
5768 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
5769 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm20[0,2,2,3]
5770 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
5771 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
5772 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5773 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
5774 ; AVX512-NEXT: vpsrlq $16, %zmm28, %zmm6
5775 ; AVX512-NEXT: vpmovqw %zmm6, %xmm6
5776 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
5777 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
5778 ; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
5779 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7]
5780 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7]
5781 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5782 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5783 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm19[0,2,2,3]
5784 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
5785 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3]
5786 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
5787 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5788 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5789 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5790 ; AVX512-NEXT: vpsrlq $16, %zmm27, %zmm3
5791 ; AVX512-NEXT: vpmovqw %zmm3, %xmm3
5792 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5793 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,1,3,4,5,6,7]
5794 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
5795 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5796 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5797 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3]
5798 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
5799 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm30[0,2,2,3]
5800 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
5801 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5802 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5803 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
5804 ; AVX512-NEXT: vpsrlq $16, %zmm26, %zmm4
5805 ; AVX512-NEXT: vpmovqw %zmm4, %xmm4
5806 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
5807 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
5808 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5809 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
5810 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7]
5811 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm22
5812 ; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
5813 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7]
5814 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5815 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5816 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3]
5817 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7]
5818 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,1,2,3]
5819 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
5820 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
5821 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5822 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
5823 ; AVX512-NEXT: vpsrlq $32, %zmm29, %zmm1
5824 ; AVX512-NEXT: vpmovqw %zmm1, %xmm1
5825 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5826 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
5827 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
5828 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
5829 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7]
5830 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5831 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5832 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm21[3,1,2,3]
5833 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7]
5834 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21
5835 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[3,1,2,3]
5836 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,0,2,3,4,5,6,7]
5837 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
5838 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
5839 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5840 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
5841 ; AVX512-NEXT: vpsrlq $32, %zmm28, %zmm8
5842 ; AVX512-NEXT: vpmovqw %zmm8, %xmm8
5843 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
5844 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm1[0,1,2,3]
5845 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5846 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[3,1,2,3]
5847 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
5848 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm23[3,1,2,3]
5849 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,2,0,4,5,6,7]
5850 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5851 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3]
5852 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7]
5853 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
5854 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
5855 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
5856 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm31[3,1,2,3]
5857 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
5858 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[3,1,2,3]
5859 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,1,2,0,4,5,6,7]
5860 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
5861 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[3,1,2,3]
5862 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7]
5863 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm30[3,1,2,3]
5864 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[2,0,2,3,4,5,6,7]
5865 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
5866 ; AVX512-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14
5867 ; AVX512-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm5
5868 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7]
5869 ; AVX512-NEXT: vpsrlq $32, %zmm27, %zmm14
5870 ; AVX512-NEXT: vpmovqw %zmm14, %xmm14
5871 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
5872 ; AVX512-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm14
5873 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5874 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7]
5875 ; AVX512-NEXT: vpsrlq $32, %zmm26, %zmm14
5876 ; AVX512-NEXT: vpmovqw %zmm14, %xmm14
5877 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
5878 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm5[0,1,2,3]
5879 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0
5880 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
5881 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7]
5882 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
5883 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7]
5884 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
5885 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
5886 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
5887 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
5888 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5889 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5890 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
5891 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
5892 ; AVX512-NEXT: vpsrlq $48, %zmm29, %zmm3
5893 ; AVX512-NEXT: vpmovqw %zmm3, %xmm3
5894 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5895 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3
5896 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
5897 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm5
5898 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
5899 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
5900 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5901 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5902 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5903 ; AVX512-NEXT: vpsrlq $48, %zmm28, %zmm3
5904 ; AVX512-NEXT: vpmovqw %zmm3, %xmm3
5905 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5906 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
5907 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7]
5908 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7]
5909 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5910 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
5911 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
5912 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5913 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
5914 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7]
5915 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5916 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5917 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5918 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
5919 ; AVX512-NEXT: vpsrlq $48, %zmm27, %zmm2
5920 ; AVX512-NEXT: vpmovqw %zmm2, %xmm2
5921 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5922 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7]
5923 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7]
5924 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
5925 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5926 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5927 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
5928 ; AVX512-NEXT: vpsrlq $48, %zmm26, %zmm3
5929 ; AVX512-NEXT: vpmovqw %zmm3, %xmm3
5930 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5931 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3]
5932 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5933 ; AVX512-NEXT: vmovaps %zmm2, 64(%rsi)
5934 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5935 ; AVX512-NEXT: vmovaps %zmm2, (%rsi)
5936 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5937 ; AVX512-NEXT: vmovaps %zmm2, 64(%rdx)
5938 ; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload
5939 ; AVX512-NEXT: vmovaps %zmm2, (%rdx)
5940 ; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rcx)
5941 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5942 ; AVX512-NEXT: vmovaps %zmm2, (%rcx)
5943 ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r8)
5944 ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
5945 ; AVX512-NEXT: addq $200, %rsp
5946 ; AVX512-NEXT: vzeroupper
5947 ; AVX512-NEXT: retq
5949 ; AVX512-FCP-LABEL: load_i16_stride4_vf64:
5950 ; AVX512-FCP: # %bb.0:
5951 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
5952 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm25
5953 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
5954 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
5955 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
5956 ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
5957 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6]
5958 ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23
5959 ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm9, %ymm10
5960 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0
5961 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
5962 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
5963 ; AVX512-FCP-NEXT: vpermd %ymm24, %ymm9, %ymm3
5964 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1
5965 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14]
5966 ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
5967 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm0
5968 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5969 ; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm26
5970 ; AVX512-FCP-NEXT: vpermd %ymm26, %ymm9, %ymm8
5971 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm11
5972 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
5973 ; AVX512-FCP-NEXT: vpermd %ymm27, %ymm9, %ymm0
5974 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm12
5975 ; AVX512-FCP-NEXT: vpermt2d %ymm11, %ymm7, %ymm12
5976 ; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm11
5977 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
5978 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm1[0,1,2,3]
5979 ; AVX512-FCP-NEXT: vmovdqa64 480(%rdi), %ymm28
5980 ; AVX512-FCP-NEXT: vpermd %ymm28, %ymm9, %ymm11
5981 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm1
5982 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %ymm17
5983 ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm12
5984 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm13
5985 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm13
5986 ; AVX512-FCP-NEXT: vpmovqw %zmm25, %xmm1
5987 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm13[4,5,6,7]
5988 ; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm18
5989 ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm1
5990 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
5991 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19
5992 ; AVX512-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm13
5993 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm9
5994 ; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm9
5995 ; AVX512-FCP-NEXT: vpmovqw %zmm22, %xmm15
5996 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
5997 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm14[0,1,2,3]
5998 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
5999 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10
6000 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
6001 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7]
6002 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm10
6003 ; AVX512-FCP-NEXT: vpmovqw %zmm10, %xmm10
6004 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
6005 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
6006 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6007 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
6008 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm2, %zmm8
6009 ; AVX512-FCP-NEXT: vpmovqw %zmm8, %xmm8
6010 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
6011 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm3[0,1,2,3]
6012 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm0
6013 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm3
6014 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6015 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm25, %zmm3
6016 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
6017 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6018 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
6019 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
6020 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6021 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm22, %zmm3
6022 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
6023 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6024 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3]
6025 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7]
6026 ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm14, %ymm3
6027 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0
6028 ; AVX512-FCP-NEXT: vpermd %ymm24, %ymm14, %ymm8
6029 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
6030 ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
6031 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
6032 ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0
6033 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6034 ; AVX512-FCP-NEXT: vpermd %ymm26, %ymm14, %ymm0
6035 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm12
6036 ; AVX512-FCP-NEXT: vpermd %ymm27, %ymm14, %ymm11
6037 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm13
6038 ; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm7, %ymm13
6039 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm2, %zmm12
6040 ; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12
6041 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
6042 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm12[0,1,2,3],zmm1[0,1,2,3]
6043 ; AVX512-FCP-NEXT: vpermd %ymm28, %ymm14, %ymm12
6044 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm1
6045 ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm14, %ymm13
6046 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm10
6047 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm10
6048 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm25, %zmm1
6049 ; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm1
6050 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
6051 ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm14, %ymm1
6052 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
6053 ; AVX512-FCP-NEXT: vpermd %ymm19, %ymm14, %ymm5
6054 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm6
6055 ; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm6
6056 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm22, %zmm7
6057 ; AVX512-FCP-NEXT: vpmovqw %zmm7, %xmm7
6058 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
6059 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
6060 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
6061 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7
6062 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
6063 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm4, %zmm4
6064 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
6065 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
6066 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6067 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm4
6068 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
6069 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm2, %zmm2
6070 ; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm2
6071 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6072 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
6073 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm2
6074 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
6075 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6076 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm25, %zmm3
6077 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
6078 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6079 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
6080 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm3
6081 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6082 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm22, %zmm3
6083 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
6084 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6085 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3]
6086 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi)
6087 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
6088 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx)
6089 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rdx)
6090 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx)
6091 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%rcx)
6092 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
6093 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
6094 ; AVX512-FCP-NEXT: vzeroupper
6095 ; AVX512-FCP-NEXT: retq
6097 ; AVX512DQ-LABEL: load_i16_stride4_vf64:
6098 ; AVX512DQ: # %bb.0:
6099 ; AVX512DQ-NEXT: subq $200, %rsp
6100 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm26
6101 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm27
6102 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm28
6103 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm29
6104 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm0
6105 ; AVX512DQ-NEXT: vpmovqw %ymm0, %xmm0
6106 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6107 ; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm14
6108 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
6109 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7]
6110 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm13
6111 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
6112 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
6113 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
6114 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6115 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6116 ; AVX512DQ-NEXT: vpmovqw %zmm29, %xmm1
6117 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6118 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1
6119 ; AVX512DQ-NEXT: vpmovqw %ymm1, %xmm1
6120 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6121 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm12
6122 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
6123 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
6124 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm11
6125 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
6126 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
6127 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6128 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6129 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
6130 ; AVX512DQ-NEXT: vpmovqw %zmm28, %xmm4
6131 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
6132 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
6133 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6134 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm0
6135 ; AVX512DQ-NEXT: vpmovqw %ymm0, %xmm0
6136 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6137 ; AVX512DQ-NEXT: vmovdqa64 496(%rdi), %xmm24
6138 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[0,2,2,3]
6139 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7]
6140 ; AVX512DQ-NEXT: vmovdqa64 480(%rdi), %xmm23
6141 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm23[0,2,2,3]
6142 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7]
6143 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
6144 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6145 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6146 ; AVX512DQ-NEXT: vpmovqw %zmm27, %xmm1
6147 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6148 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm1
6149 ; AVX512DQ-NEXT: vpmovqw %ymm1, %xmm1
6150 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6151 ; AVX512DQ-NEXT: vmovdqa64 368(%rdi), %xmm31
6152 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm31[0,2,2,3]
6153 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7]
6154 ; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %xmm25
6155 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm25[0,2,2,3]
6156 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7]
6157 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
6158 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6159 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6160 ; AVX512DQ-NEXT: vpmovqw %zmm26, %xmm1
6161 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6162 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3]
6163 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6164 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %xmm30
6165 ; AVX512DQ-NEXT: vmovdqa64 336(%rdi), %xmm17
6166 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %xmm18
6167 ; AVX512DQ-NEXT: vmovdqa64 464(%rdi), %xmm19
6168 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %xmm20
6169 ; AVX512DQ-NEXT: vmovdqa64 80(%rdi), %xmm21
6170 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm0
6171 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm1
6172 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7]
6173 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
6174 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
6175 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6176 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
6177 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
6178 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
6179 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
6180 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
6181 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6182 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
6183 ; AVX512DQ-NEXT: vpsrlq $16, %zmm29, %zmm6
6184 ; AVX512DQ-NEXT: vpmovqw %zmm6, %xmm6
6185 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
6186 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
6187 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
6188 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
6189 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6190 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm21[0,2,2,3]
6191 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
6192 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm20[0,2,2,3]
6193 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
6194 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
6195 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6196 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
6197 ; AVX512DQ-NEXT: vpsrlq $16, %zmm28, %zmm6
6198 ; AVX512DQ-NEXT: vpmovqw %zmm6, %xmm6
6199 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
6200 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
6201 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
6202 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7]
6203 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7]
6204 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6205 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6206 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm19[0,2,2,3]
6207 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
6208 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3]
6209 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
6210 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
6211 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6212 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6213 ; AVX512DQ-NEXT: vpsrlq $16, %zmm27, %zmm3
6214 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
6215 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6216 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,1,3,4,5,6,7]
6217 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
6218 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
6219 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6220 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3]
6221 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
6222 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm30[0,2,2,3]
6223 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
6224 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6225 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6226 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
6227 ; AVX512DQ-NEXT: vpsrlq $16, %zmm26, %zmm4
6228 ; AVX512DQ-NEXT: vpmovqw %zmm4, %xmm4
6229 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
6230 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
6231 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6232 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
6233 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7]
6234 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22
6235 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
6236 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7]
6237 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
6238 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6239 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3]
6240 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7]
6241 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,1,2,3]
6242 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
6243 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
6244 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6245 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
6246 ; AVX512DQ-NEXT: vpsrlq $32, %zmm29, %zmm1
6247 ; AVX512DQ-NEXT: vpmovqw %zmm1, %xmm1
6248 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6249 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
6250 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
6251 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
6252 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7]
6253 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6254 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6255 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm21[3,1,2,3]
6256 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7]
6257 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21
6258 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[3,1,2,3]
6259 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,0,2,3,4,5,6,7]
6260 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
6261 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
6262 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6263 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
6264 ; AVX512DQ-NEXT: vpsrlq $32, %zmm28, %zmm8
6265 ; AVX512DQ-NEXT: vpmovqw %zmm8, %xmm8
6266 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
6267 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm1[0,1,2,3]
6268 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6269 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[3,1,2,3]
6270 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
6271 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm23[3,1,2,3]
6272 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,2,0,4,5,6,7]
6273 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6274 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3]
6275 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7]
6276 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
6277 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
6278 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
6279 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm31[3,1,2,3]
6280 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
6281 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[3,1,2,3]
6282 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,1,2,0,4,5,6,7]
6283 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
6284 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[3,1,2,3]
6285 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7]
6286 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm30[3,1,2,3]
6287 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[2,0,2,3,4,5,6,7]
6288 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
6289 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14
6290 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm5
6291 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7]
6292 ; AVX512DQ-NEXT: vpsrlq $32, %zmm27, %zmm14
6293 ; AVX512DQ-NEXT: vpmovqw %zmm14, %xmm14
6294 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
6295 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm14
6296 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6297 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7]
6298 ; AVX512DQ-NEXT: vpsrlq $32, %zmm26, %zmm14
6299 ; AVX512DQ-NEXT: vpmovqw %zmm14, %xmm14
6300 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
6301 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm5[0,1,2,3]
6302 ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0
6303 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
6304 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7]
6305 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6306 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7]
6307 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
6308 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
6309 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
6310 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
6311 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6312 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6313 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
6314 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6315 ; AVX512DQ-NEXT: vpsrlq $48, %zmm29, %zmm3
6316 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
6317 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6318 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3
6319 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
6320 ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm5
6321 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
6322 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
6323 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6324 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6325 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6326 ; AVX512DQ-NEXT: vpsrlq $48, %zmm28, %zmm3
6327 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
6328 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6329 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
6330 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7]
6331 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7]
6332 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6333 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
6334 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
6335 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
6336 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
6337 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7]
6338 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
6339 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6340 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6341 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6342 ; AVX512DQ-NEXT: vpsrlq $48, %zmm27, %zmm2
6343 ; AVX512DQ-NEXT: vpmovqw %zmm2, %xmm2
6344 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6345 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7]
6346 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7]
6347 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
6348 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6349 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6350 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
6351 ; AVX512DQ-NEXT: vpsrlq $48, %zmm26, %zmm3
6352 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
6353 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6354 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3]
6355 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6356 ; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi)
6357 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6358 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi)
6359 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6360 ; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx)
6361 ; AVX512DQ-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload
6362 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx)
6363 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rcx)
6364 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6365 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx)
6366 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r8)
6367 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
6368 ; AVX512DQ-NEXT: addq $200, %rsp
6369 ; AVX512DQ-NEXT: vzeroupper
6370 ; AVX512DQ-NEXT: retq
6372 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf64:
6373 ; AVX512DQ-FCP: # %bb.0:
6374 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
6375 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm25
6376 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
6377 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
6378 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
6379 ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
6380 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6]
6381 ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23
6382 ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm9, %ymm10
6383 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0
6384 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
6385 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
6386 ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm9, %ymm3
6387 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1
6388 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14]
6389 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
6390 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm0
6391 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6392 ; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm26
6393 ; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm9, %ymm8
6394 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm11
6395 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
6396 ; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm9, %ymm0
6397 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm12
6398 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm11, %ymm7, %ymm12
6399 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm11
6400 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
6401 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm1[0,1,2,3]
6402 ; AVX512DQ-FCP-NEXT: vmovdqa64 480(%rdi), %ymm28
6403 ; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm9, %ymm11
6404 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm1
6405 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %ymm17
6406 ; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm12
6407 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm13
6408 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm13
6409 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm25, %xmm1
6410 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm13[4,5,6,7]
6411 ; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm18
6412 ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm1
6413 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
6414 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19
6415 ; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm13
6416 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm9
6417 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm9
6418 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm22, %xmm15
6419 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
6420 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm14[0,1,2,3]
6421 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
6422 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10
6423 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
6424 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7]
6425 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm10
6426 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm10, %xmm10
6427 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
6428 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
6429 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6430 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
6431 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm2, %zmm8
6432 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm8, %xmm8
6433 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
6434 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm3[0,1,2,3]
6435 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm0
6436 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm3
6437 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6438 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm25, %zmm3
6439 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm22, %zmm3
; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3]
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7]
; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm14, %ymm3
; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0
; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm14, %ymm8
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm14, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm12
; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm14, %ymm11
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm13
; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm7, %ymm13
; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm2, %zmm12
; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm12[0,1,2,3],zmm1[0,1,2,3]
; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm14, %ymm12
; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm1
; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm14, %ymm13
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm10
; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm10
; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm25, %zmm1
; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm1
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm14, %ymm1
; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm14, %ymm5
; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm6
; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm6
; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm22, %zmm7
; AVX512DQ-FCP-NEXT: vpmovqw %zmm7, %xmm7
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm4, %zmm4
; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm4
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm2, %zmm2
; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm2
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm2
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm25, %zmm3
; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm22, %zmm3
; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3]
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rdx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%rcx)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
; AVX512BW-LABEL: load_i16_stride4_vf64:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4
; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5
; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6
; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10
; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm10
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
; AVX512BW-NEXT: vpermt2w %zmm3, %zmm8, %zmm10
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11
; AVX512BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm11
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm12
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12
; AVX512BW-NEXT: vpermt2w %zmm3, %zmm10, %zmm12
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm13
; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14
; AVX512BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm14
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14
; AVX512BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm14
; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm7
; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm5
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
; AVX512BW-NEXT: vpermt2w %zmm3, %zmm14, %zmm2
; AVX512BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm0
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx)
; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx)
; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8)
; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BW-FCP-LABEL: load_i16_stride4_vf64:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4
; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6
; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm10
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm8, %zmm10
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm11
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm12
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm10, %zmm12
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm13
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm14
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm12, %zmm14
; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm14, %zmm7
; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm5
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2
; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm0
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: load_i16_stride4_vf64:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4
; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5
; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6
; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10
; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm10
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10
; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm8, %zmm10
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm11
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm12
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12
; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm10, %zmm12
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm13
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14
; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm14
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14
; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm14
; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm7
; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm5
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm14, %zmm2
; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm0
; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rcx)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r8)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512DQ-BW-NEXT: vzeroupper
; AVX512DQ-BW-NEXT: retq
;
; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf64:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm10
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm8, %zmm10
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm11
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm12
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm10, %zmm12
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm13
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm14
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm12, %zmm14
; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm14, %zmm7
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm5
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2
; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm0
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
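; The IR below is what every prefix above is checked against: a single wide <256 x i16>
; load that is deinterleaved into four stride-4 <64 x i16> results via shufflevector
; (lanes 0,4,8,..., 1,5,9,..., 2,6,10,... and 3,7,11,...), each stored to its own
; output pointer.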
  %wide.vec = load <256 x i16>, ptr %in.vec, align 64
  %strided.vec0 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124, i32 128, i32 132, i32 136, i32 140, i32 144, i32 148, i32 152, i32 156, i32 160, i32 164, i32 168, i32 172, i32 176, i32 180, i32 184, i32 188, i32 192, i32 196, i32 200, i32 204, i32 208, i32 212, i32 216, i32 220, i32 224, i32 228, i32 232, i32 236, i32 240, i32 244, i32 248, i32 252>
  %strided.vec1 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <64 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125, i32 129, i32 133, i32 137, i32 141, i32 145, i32 149, i32 153, i32 157, i32 161, i32 165, i32 169, i32 173, i32 177, i32 181, i32 185, i32 189, i32 193, i32 197, i32 201, i32 205, i32 209, i32 213, i32 217, i32 221, i32 225, i32 229, i32 233, i32 237, i32 241, i32 245, i32 249, i32 253>
  %strided.vec2 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <64 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126, i32 130, i32 134, i32 138, i32 142, i32 146, i32 150, i32 154, i32 158, i32 162, i32 166, i32 170, i32 174, i32 178, i32 182, i32 186, i32 190, i32 194, i32 198, i32 202, i32 206, i32 210, i32 214, i32 218, i32 222, i32 226, i32 230, i32 234, i32 238, i32 242, i32 246, i32 250, i32 254>
  %strided.vec3 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <64 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127, i32 131, i32 135, i32 139, i32 143, i32 147, i32 151, i32 155, i32 159, i32 163, i32 167, i32 171, i32 175, i32 179, i32 183, i32 187, i32 191, i32 195, i32 199, i32 203, i32 207, i32 211, i32 215, i32 219, i32 223, i32 227, i32 231, i32 235, i32 239, i32 243, i32 247, i32 251, i32 255>
  store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
  store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
  store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
  store <64 x i16> %strided.vec3, ptr %out.vec3, align 64