1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
3 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
4 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
5 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
6 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
7 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
8 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
9 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
10 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
11 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
12 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
13 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
14 ; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
16 ; These patterns are produced by LoopVectorizer for interleaved loads.
18 define void @load_i16_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
19 ; SSE-LABEL: load_i16_stride4_vf2:
21 ; SSE-NEXT: movdqa (%rdi), %xmm0
22 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
23 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
24 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
25 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
26 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
27 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
28 ; SSE-NEXT: movd %xmm2, (%rsi)
29 ; SSE-NEXT: movd %xmm1, (%rdx)
30 ; SSE-NEXT: movd %xmm3, (%rcx)
31 ; SSE-NEXT: movd %xmm0, (%r8)
34 ; AVX-LABEL: load_i16_stride4_vf2:
36 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
37 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
38 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
39 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
40 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
41 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
42 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
43 ; AVX-NEXT: vmovd %xmm2, (%rsi)
44 ; AVX-NEXT: vmovd %xmm1, (%rdx)
45 ; AVX-NEXT: vmovd %xmm3, (%rcx)
46 ; AVX-NEXT: vmovd %xmm0, (%r8)
49 ; AVX2-LABEL: load_i16_stride4_vf2:
51 ; AVX2-NEXT: vmovdqa (%rdi), %xmm0
52 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
53 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
54 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
55 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
56 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
57 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
58 ; AVX2-NEXT: vmovd %xmm2, (%rsi)
59 ; AVX2-NEXT: vmovd %xmm1, (%rdx)
60 ; AVX2-NEXT: vmovd %xmm3, (%rcx)
61 ; AVX2-NEXT: vmovd %xmm0, (%r8)
64 ; AVX2-FP-LABEL: load_i16_stride4_vf2:
66 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
67 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
68 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
69 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
70 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
71 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
72 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
73 ; AVX2-FP-NEXT: vmovd %xmm2, (%rsi)
74 ; AVX2-FP-NEXT: vmovd %xmm1, (%rdx)
75 ; AVX2-FP-NEXT: vmovd %xmm3, (%rcx)
76 ; AVX2-FP-NEXT: vmovd %xmm0, (%r8)
79 ; AVX2-FCP-LABEL: load_i16_stride4_vf2:
81 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
82 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
83 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
84 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
85 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
86 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
87 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
88 ; AVX2-FCP-NEXT: vmovd %xmm2, (%rsi)
89 ; AVX2-FCP-NEXT: vmovd %xmm1, (%rdx)
90 ; AVX2-FCP-NEXT: vmovd %xmm3, (%rcx)
91 ; AVX2-FCP-NEXT: vmovd %xmm0, (%r8)
94 ; AVX512-LABEL: load_i16_stride4_vf2:
96 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
97 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
98 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
99 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
100 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
101 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
102 ; AVX512-NEXT: vpmovqw %xmm0, (%rsi)
103 ; AVX512-NEXT: vmovd %xmm1, (%rdx)
104 ; AVX512-NEXT: vmovd %xmm3, (%rcx)
105 ; AVX512-NEXT: vmovd %xmm2, (%r8)
108 ; AVX512-FCP-LABEL: load_i16_stride4_vf2:
109 ; AVX512-FCP: # %bb.0:
110 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
111 ; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
112 ; AVX512-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
113 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
114 ; AVX512-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
115 ; AVX512-FCP-NEXT: vpmovqw %xmm0, (%rsi)
116 ; AVX512-FCP-NEXT: vmovd %xmm1, (%rdx)
117 ; AVX512-FCP-NEXT: vmovd %xmm3, (%rcx)
118 ; AVX512-FCP-NEXT: vmovd %xmm2, (%r8)
119 ; AVX512-FCP-NEXT: retq
121 ; AVX512DQ-LABEL: load_i16_stride4_vf2:
123 ; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
124 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
125 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
126 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
127 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
128 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
129 ; AVX512DQ-NEXT: vpmovqw %xmm0, (%rsi)
130 ; AVX512DQ-NEXT: vmovd %xmm1, (%rdx)
131 ; AVX512DQ-NEXT: vmovd %xmm3, (%rcx)
132 ; AVX512DQ-NEXT: vmovd %xmm2, (%r8)
133 ; AVX512DQ-NEXT: retq
135 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf2:
136 ; AVX512DQ-FCP: # %bb.0:
137 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
138 ; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
139 ; AVX512DQ-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
140 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
141 ; AVX512DQ-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
142 ; AVX512DQ-FCP-NEXT: vpmovqw %xmm0, (%rsi)
143 ; AVX512DQ-FCP-NEXT: vmovd %xmm1, (%rdx)
144 ; AVX512DQ-FCP-NEXT: vmovd %xmm3, (%rcx)
145 ; AVX512DQ-FCP-NEXT: vmovd %xmm2, (%r8)
146 ; AVX512DQ-FCP-NEXT: retq
148 ; AVX512BW-LABEL: load_i16_stride4_vf2:
150 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
151 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
152 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
153 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
154 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
155 ; AVX512BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
156 ; AVX512BW-NEXT: vpmovqw %xmm0, (%rsi)
157 ; AVX512BW-NEXT: vmovd %xmm1, (%rdx)
158 ; AVX512BW-NEXT: vmovd %xmm3, (%rcx)
159 ; AVX512BW-NEXT: vmovd %xmm2, (%r8)
160 ; AVX512BW-NEXT: retq
162 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf2:
163 ; AVX512BW-FCP: # %bb.0:
164 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
165 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
166 ; AVX512BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
167 ; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
168 ; AVX512BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
169 ; AVX512BW-FCP-NEXT: vpmovqw %xmm0, (%rsi)
170 ; AVX512BW-FCP-NEXT: vmovd %xmm1, (%rdx)
171 ; AVX512BW-FCP-NEXT: vmovd %xmm3, (%rcx)
172 ; AVX512BW-FCP-NEXT: vmovd %xmm2, (%r8)
173 ; AVX512BW-FCP-NEXT: retq
175 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf2:
176 ; AVX512DQ-BW: # %bb.0:
177 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
178 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
179 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
180 ; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
181 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
182 ; AVX512DQ-BW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
183 ; AVX512DQ-BW-NEXT: vpmovqw %xmm0, (%rsi)
184 ; AVX512DQ-BW-NEXT: vmovd %xmm1, (%rdx)
185 ; AVX512DQ-BW-NEXT: vmovd %xmm3, (%rcx)
186 ; AVX512DQ-BW-NEXT: vmovd %xmm2, (%r8)
187 ; AVX512DQ-BW-NEXT: retq
189 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf2:
190 ; AVX512DQ-BW-FCP: # %bb.0:
191 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
192 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
193 ; AVX512DQ-BW-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
194 ; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
195 ; AVX512DQ-BW-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
196 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %xmm0, (%rsi)
197 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm1, (%rdx)
198 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm3, (%rcx)
199 ; AVX512DQ-BW-FCP-NEXT: vmovd %xmm2, (%r8)
200 ; AVX512DQ-BW-FCP-NEXT: retq
201 %wide.vec = load <8 x i16>, ptr %in.vec, align 64
202 %strided.vec0 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> <i32 0, i32 4>
203 %strided.vec1 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> <i32 1, i32 5>
204 %strided.vec2 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> <i32 2, i32 6>
205 %strided.vec3 = shufflevector <8 x i16> %wide.vec, <8 x i16> poison, <2 x i32> <i32 3, i32 7>
206 store <2 x i16> %strided.vec0, ptr %out.vec0, align 64
207 store <2 x i16> %strided.vec1, ptr %out.vec1, align 64
208 store <2 x i16> %strided.vec2, ptr %out.vec2, align 64
209 store <2 x i16> %strided.vec3, ptr %out.vec3, align 64
213 define void @load_i16_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
214 ; SSE-LABEL: load_i16_stride4_vf4:
216 ; SSE-NEXT: movdqa (%rdi), %xmm0
217 ; SSE-NEXT: movdqa 16(%rdi), %xmm1
218 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
219 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[0,2,2,3,4,5,6,7]
220 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
221 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,2,2,3,4,5,6,7]
222 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
223 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
224 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7]
225 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
226 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
227 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
228 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
229 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[2,0,2,3,4,5,6,7]
230 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
231 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
232 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
233 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
234 ; SSE-NEXT: movq %xmm5, (%rsi)
235 ; SSE-NEXT: movq %xmm3, (%rdx)
236 ; SSE-NEXT: movq %xmm4, (%rcx)
237 ; SSE-NEXT: movq %xmm0, (%r8)
240 ; AVX-LABEL: load_i16_stride4_vf4:
242 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
243 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
244 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
245 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
246 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
247 ; AVX-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
248 ; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
249 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
250 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
251 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
252 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
253 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
254 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
255 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
256 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
257 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
258 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
259 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
260 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
261 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
262 ; AVX-NEXT: vmovq %xmm0, (%rsi)
263 ; AVX-NEXT: vmovq %xmm3, (%rdx)
264 ; AVX-NEXT: vmovq %xmm4, (%rcx)
265 ; AVX-NEXT: vmovq %xmm1, (%r8)
268 ; AVX2-LABEL: load_i16_stride4_vf4:
270 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
271 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
272 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
273 ; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
274 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
275 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
276 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
277 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
278 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
279 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[0,2,2,3]
280 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
281 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
282 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
283 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
284 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
285 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
286 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
287 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
288 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
289 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
290 ; AVX2-NEXT: vmovq %xmm0, (%rsi)
291 ; AVX2-NEXT: vmovq %xmm3, (%rdx)
292 ; AVX2-NEXT: vmovq %xmm4, (%rcx)
293 ; AVX2-NEXT: vmovq %xmm1, (%r8)
296 ; AVX2-FP-LABEL: load_i16_stride4_vf4:
298 ; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0
299 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
300 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2
301 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
302 ; AVX2-FP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
303 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
304 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
305 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
306 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
307 ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
308 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
309 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
310 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
311 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
312 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
313 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
314 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
315 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
316 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
317 ; AVX2-FP-NEXT: vmovq %xmm0, (%rsi)
318 ; AVX2-FP-NEXT: vmovq %xmm3, (%rdx)
319 ; AVX2-FP-NEXT: vmovq %xmm4, (%rcx)
320 ; AVX2-FP-NEXT: vmovq %xmm1, (%r8)
323 ; AVX2-FCP-LABEL: load_i16_stride4_vf4:
325 ; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0
326 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
327 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
328 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm3 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
329 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
330 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
331 ; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
332 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
333 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm4
334 ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm3
335 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
336 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
337 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[2,0,2,3,4,5,6,7]
338 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
339 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[2,0,2,3,4,5,6,7]
340 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
341 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
342 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
343 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
344 ; AVX2-FCP-NEXT: vmovq %xmm0, (%rsi)
345 ; AVX2-FCP-NEXT: vmovq %xmm3, (%rdx)
346 ; AVX2-FCP-NEXT: vmovq %xmm4, (%rcx)
347 ; AVX2-FCP-NEXT: vmovq %xmm1, (%r8)
348 ; AVX2-FCP-NEXT: retq
350 ; AVX512-LABEL: load_i16_stride4_vf4:
352 ; AVX512-NEXT: vmovdqa (%rdi), %ymm0
353 ; AVX512-NEXT: vpsrlq $16, %ymm0, %ymm1
354 ; AVX512-NEXT: vpsrlq $32, %ymm0, %ymm2
355 ; AVX512-NEXT: vpsrlq $48, %ymm0, %ymm3
356 ; AVX512-NEXT: vpmovqw %ymm0, (%rsi)
357 ; AVX512-NEXT: vpmovqw %ymm1, (%rdx)
358 ; AVX512-NEXT: vpmovqw %ymm2, (%rcx)
359 ; AVX512-NEXT: vpmovqw %ymm3, (%r8)
360 ; AVX512-NEXT: vzeroupper
363 ; AVX512-FCP-LABEL: load_i16_stride4_vf4:
364 ; AVX512-FCP: # %bb.0:
365 ; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0
366 ; AVX512-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
367 ; AVX512-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
368 ; AVX512-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
369 ; AVX512-FCP-NEXT: vpmovqw %ymm0, (%rsi)
370 ; AVX512-FCP-NEXT: vpmovqw %ymm1, (%rdx)
371 ; AVX512-FCP-NEXT: vpmovqw %ymm2, (%rcx)
372 ; AVX512-FCP-NEXT: vpmovqw %ymm3, (%r8)
373 ; AVX512-FCP-NEXT: vzeroupper
374 ; AVX512-FCP-NEXT: retq
376 ; AVX512DQ-LABEL: load_i16_stride4_vf4:
378 ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0
379 ; AVX512DQ-NEXT: vpsrlq $16, %ymm0, %ymm1
380 ; AVX512DQ-NEXT: vpsrlq $32, %ymm0, %ymm2
381 ; AVX512DQ-NEXT: vpsrlq $48, %ymm0, %ymm3
382 ; AVX512DQ-NEXT: vpmovqw %ymm0, (%rsi)
383 ; AVX512DQ-NEXT: vpmovqw %ymm1, (%rdx)
384 ; AVX512DQ-NEXT: vpmovqw %ymm2, (%rcx)
385 ; AVX512DQ-NEXT: vpmovqw %ymm3, (%r8)
386 ; AVX512DQ-NEXT: vzeroupper
387 ; AVX512DQ-NEXT: retq
389 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf4:
390 ; AVX512DQ-FCP: # %bb.0:
391 ; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0
392 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
393 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
394 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
395 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm0, (%rsi)
396 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm1, (%rdx)
397 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm2, (%rcx)
398 ; AVX512DQ-FCP-NEXT: vpmovqw %ymm3, (%r8)
399 ; AVX512DQ-FCP-NEXT: vzeroupper
400 ; AVX512DQ-FCP-NEXT: retq
402 ; AVX512BW-LABEL: load_i16_stride4_vf4:
404 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0
405 ; AVX512BW-NEXT: vpsrlq $16, %ymm0, %ymm1
406 ; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm2
407 ; AVX512BW-NEXT: vpsrlq $48, %ymm0, %ymm3
408 ; AVX512BW-NEXT: vpmovqw %ymm0, (%rsi)
409 ; AVX512BW-NEXT: vpmovqw %ymm1, (%rdx)
410 ; AVX512BW-NEXT: vpmovqw %ymm2, (%rcx)
411 ; AVX512BW-NEXT: vpmovqw %ymm3, (%r8)
412 ; AVX512BW-NEXT: vzeroupper
413 ; AVX512BW-NEXT: retq
415 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf4:
416 ; AVX512BW-FCP: # %bb.0:
417 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
418 ; AVX512BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
419 ; AVX512BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
420 ; AVX512BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
421 ; AVX512BW-FCP-NEXT: vpmovqw %ymm0, (%rsi)
422 ; AVX512BW-FCP-NEXT: vpmovqw %ymm1, (%rdx)
423 ; AVX512BW-FCP-NEXT: vpmovqw %ymm2, (%rcx)
424 ; AVX512BW-FCP-NEXT: vpmovqw %ymm3, (%r8)
425 ; AVX512BW-FCP-NEXT: vzeroupper
426 ; AVX512BW-FCP-NEXT: retq
428 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf4:
429 ; AVX512DQ-BW: # %bb.0:
430 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm0
431 ; AVX512DQ-BW-NEXT: vpsrlq $16, %ymm0, %ymm1
432 ; AVX512DQ-BW-NEXT: vpsrlq $32, %ymm0, %ymm2
433 ; AVX512DQ-BW-NEXT: vpsrlq $48, %ymm0, %ymm3
434 ; AVX512DQ-BW-NEXT: vpmovqw %ymm0, (%rsi)
435 ; AVX512DQ-BW-NEXT: vpmovqw %ymm1, (%rdx)
436 ; AVX512DQ-BW-NEXT: vpmovqw %ymm2, (%rcx)
437 ; AVX512DQ-BW-NEXT: vpmovqw %ymm3, (%r8)
438 ; AVX512DQ-BW-NEXT: vzeroupper
439 ; AVX512DQ-BW-NEXT: retq
441 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf4:
442 ; AVX512DQ-BW-FCP: # %bb.0:
443 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm0
444 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %ymm0, %ymm1
445 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %ymm0, %ymm2
446 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %ymm0, %ymm3
447 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm0, (%rsi)
448 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm1, (%rdx)
449 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm2, (%rcx)
450 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %ymm3, (%r8)
451 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
452 ; AVX512DQ-BW-FCP-NEXT: retq
453 %wide.vec = load <16 x i16>, ptr %in.vec, align 64
454 %strided.vec0 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
455 %strided.vec1 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
456 %strided.vec2 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
457 %strided.vec3 = shufflevector <16 x i16> %wide.vec, <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
458 store <4 x i16> %strided.vec0, ptr %out.vec0, align 64
459 store <4 x i16> %strided.vec1, ptr %out.vec1, align 64
460 store <4 x i16> %strided.vec2, ptr %out.vec2, align 64
461 store <4 x i16> %strided.vec3, ptr %out.vec3, align 64
465 define void @load_i16_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
466 ; SSE-LABEL: load_i16_stride4_vf8:
468 ; SSE-NEXT: movdqa (%rdi), %xmm2
469 ; SSE-NEXT: movdqa 16(%rdi), %xmm3
470 ; SSE-NEXT: movdqa 32(%rdi), %xmm1
471 ; SSE-NEXT: movdqa 48(%rdi), %xmm4
472 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,2,3]
473 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7]
474 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
475 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,2,2,3,4,5,6,7]
476 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1]
477 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm4[0,2,2,3]
478 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm8[0,1,0,2,4,5,6,7]
479 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
480 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7]
481 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1]
482 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm7[0],xmm0[1]
483 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
484 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
485 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
486 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm8[0,1,1,3,4,5,6,7]
487 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm10[0,1,1,3,4,5,6,7]
488 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1]
489 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1]
490 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
491 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[2,0,2,3,4,5,6,7]
492 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
493 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[2,0,2,3,4,5,6,7]
494 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
495 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
496 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,2,0,4,5,6,7]
497 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
498 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,1,2,0,4,5,6,7]
499 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
500 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm6[0],xmm8[1]
501 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
502 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
503 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
504 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
505 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
506 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
507 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
508 ; SSE-NEXT: movapd %xmm0, (%rsi)
509 ; SSE-NEXT: movapd %xmm7, (%rdx)
510 ; SSE-NEXT: movapd %xmm8, (%rcx)
511 ; SSE-NEXT: movapd %xmm1, (%r8)
514 ; AVX-LABEL: load_i16_stride4_vf8:
516 ; AVX-NEXT: vpxor %xmm0, %xmm0, %xmm0
517 ; AVX-NEXT: vmovdqa (%rdi), %xmm1
518 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm2
519 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm3
520 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm4
521 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm4[0],xmm0[1,2,3],xmm4[4],xmm0[5,6,7]
522 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm3[0],xmm0[1,2,3],xmm3[4],xmm0[5,6,7]
523 ; AVX-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
524 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm2[0],xmm0[1,2,3],xmm2[4],xmm0[5,6,7]
525 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3],xmm1[4],xmm0[5,6,7]
526 ; AVX-NEXT: vpackusdw %xmm6, %xmm0, %xmm0
527 ; AVX-NEXT: vpackusdw %xmm5, %xmm0, %xmm0
528 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
529 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
530 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
531 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
532 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
533 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
534 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
535 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
536 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
537 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
538 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
539 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
540 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7]
541 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
542 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7]
543 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
544 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
545 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
546 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
547 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
548 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
549 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
550 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
551 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
552 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
553 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
554 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
555 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
556 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
557 ; AVX-NEXT: vmovdqa %xmm0, (%rsi)
558 ; AVX-NEXT: vmovdqa %xmm5, (%rdx)
559 ; AVX-NEXT: vmovdqa %xmm6, (%rcx)
560 ; AVX-NEXT: vmovdqa %xmm1, (%r8)
563 ; AVX2-LABEL: load_i16_stride4_vf8:
565 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
566 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
567 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
568 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
569 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
570 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
571 ; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
572 ; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
573 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
574 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
575 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
576 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm4
577 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
578 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
579 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[0,2,2,3]
580 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
581 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
582 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
583 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
584 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[0,2,2,3]
585 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
586 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
587 ; AVX2-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
588 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
589 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7]
590 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
591 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7]
592 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
593 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
594 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
595 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
596 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
597 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
598 ; AVX2-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
599 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
600 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
601 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
602 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
603 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
604 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
605 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
606 ; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
607 ; AVX2-NEXT: vmovdqa %xmm5, (%rdx)
608 ; AVX2-NEXT: vmovdqa %xmm6, (%rcx)
609 ; AVX2-NEXT: vmovdqa %xmm1, (%r8)
610 ; AVX2-NEXT: vzeroupper
613 ; AVX2-FP-LABEL: load_i16_stride4_vf8:
615 ; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0
616 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
617 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
618 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
619 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
620 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm2
621 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
622 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
623 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
624 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm2
625 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm3
626 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm4
627 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
628 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm4, %xmm6
629 ; AVX2-FP-NEXT: vpshufb %xmm5, %xmm3, %xmm5
630 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
631 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
632 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm7
633 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
634 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
635 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
636 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
637 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7]
638 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
639 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7]
640 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
641 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
642 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
643 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
644 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
645 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
646 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
647 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
648 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
649 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
650 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
651 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
652 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
653 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
654 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsi)
655 ; AVX2-FP-NEXT: vmovdqa %xmm5, (%rdx)
656 ; AVX2-FP-NEXT: vmovdqa %xmm6, (%rcx)
657 ; AVX2-FP-NEXT: vmovdqa %xmm1, (%r8)
658 ; AVX2-FP-NEXT: vzeroupper
661 ; AVX2-FCP-LABEL: load_i16_stride4_vf8:
663 ; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0
664 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
665 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
666 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
667 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
668 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
669 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
670 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
671 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm1
672 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
673 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm3
674 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm4
675 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
676 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm4, %xmm6
677 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm3, %xmm5
678 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
679 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
680 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm2, %xmm7
681 ; AVX2-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6
682 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
683 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3]
684 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
685 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm4[0,1,2,0,4,5,6,7]
686 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
687 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm3[0,1,2,0,4,5,6,7]
688 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
689 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
690 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm2[2,0,2,3,4,5,6,7]
691 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
692 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
693 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
694 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3]
695 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
696 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
697 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
698 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
699 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
700 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
701 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
702 ; AVX2-FCP-NEXT: vmovdqa %xmm0, (%rsi)
703 ; AVX2-FCP-NEXT: vmovdqa %xmm5, (%rdx)
704 ; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rcx)
705 ; AVX2-FCP-NEXT: vmovdqa %xmm1, (%r8)
706 ; AVX2-FCP-NEXT: vzeroupper
707 ; AVX2-FCP-NEXT: retq
709 ; AVX512-LABEL: load_i16_stride4_vf8:
711 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
712 ; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm1
713 ; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm2
714 ; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm3
715 ; AVX512-NEXT: vpmovqw %zmm0, (%rsi)
716 ; AVX512-NEXT: vpmovqw %zmm1, (%rdx)
717 ; AVX512-NEXT: vpmovqw %zmm2, (%rcx)
718 ; AVX512-NEXT: vpmovqw %zmm3, (%r8)
719 ; AVX512-NEXT: vzeroupper
722 ; AVX512-FCP-LABEL: load_i16_stride4_vf8:
723 ; AVX512-FCP: # %bb.0:
724 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
725 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
726 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
727 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm3
728 ; AVX512-FCP-NEXT: vpmovqw %zmm0, (%rsi)
729 ; AVX512-FCP-NEXT: vpmovqw %zmm1, (%rdx)
730 ; AVX512-FCP-NEXT: vpmovqw %zmm2, (%rcx)
731 ; AVX512-FCP-NEXT: vpmovqw %zmm3, (%r8)
732 ; AVX512-FCP-NEXT: vzeroupper
733 ; AVX512-FCP-NEXT: retq
735 ; AVX512DQ-LABEL: load_i16_stride4_vf8:
737 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
738 ; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm1
739 ; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm2
740 ; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm3
741 ; AVX512DQ-NEXT: vpmovqw %zmm0, (%rsi)
742 ; AVX512DQ-NEXT: vpmovqw %zmm1, (%rdx)
743 ; AVX512DQ-NEXT: vpmovqw %zmm2, (%rcx)
744 ; AVX512DQ-NEXT: vpmovqw %zmm3, (%r8)
745 ; AVX512DQ-NEXT: vzeroupper
746 ; AVX512DQ-NEXT: retq
748 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf8:
749 ; AVX512DQ-FCP: # %bb.0:
750 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
751 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
752 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
753 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm3
754 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, (%rsi)
755 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, (%rdx)
756 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, (%rcx)
757 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, (%r8)
758 ; AVX512DQ-FCP-NEXT: vzeroupper
759 ; AVX512DQ-FCP-NEXT: retq
761 ; AVX512BW-LABEL: load_i16_stride4_vf8:
763 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
764 ; AVX512BW-NEXT: vpsrlq $16, %zmm0, %zmm1
765 ; AVX512BW-NEXT: vpsrlq $32, %zmm0, %zmm2
766 ; AVX512BW-NEXT: vpsrlq $48, %zmm0, %zmm3
767 ; AVX512BW-NEXT: vpmovqw %zmm0, (%rsi)
768 ; AVX512BW-NEXT: vpmovqw %zmm1, (%rdx)
769 ; AVX512BW-NEXT: vpmovqw %zmm2, (%rcx)
770 ; AVX512BW-NEXT: vpmovqw %zmm3, (%r8)
771 ; AVX512BW-NEXT: vzeroupper
772 ; AVX512BW-NEXT: retq
774 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf8:
775 ; AVX512BW-FCP: # %bb.0:
776 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
777 ; AVX512BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
778 ; AVX512BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
779 ; AVX512BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm3
780 ; AVX512BW-FCP-NEXT: vpmovqw %zmm0, (%rsi)
781 ; AVX512BW-FCP-NEXT: vpmovqw %zmm1, (%rdx)
782 ; AVX512BW-FCP-NEXT: vpmovqw %zmm2, (%rcx)
783 ; AVX512BW-FCP-NEXT: vpmovqw %zmm3, (%r8)
784 ; AVX512BW-FCP-NEXT: vzeroupper
785 ; AVX512BW-FCP-NEXT: retq
787 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf8:
788 ; AVX512DQ-BW: # %bb.0:
789 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
790 ; AVX512DQ-BW-NEXT: vpsrlq $16, %zmm0, %zmm1
791 ; AVX512DQ-BW-NEXT: vpsrlq $32, %zmm0, %zmm2
792 ; AVX512DQ-BW-NEXT: vpsrlq $48, %zmm0, %zmm3
793 ; AVX512DQ-BW-NEXT: vpmovqw %zmm0, (%rsi)
794 ; AVX512DQ-BW-NEXT: vpmovqw %zmm1, (%rdx)
795 ; AVX512DQ-BW-NEXT: vpmovqw %zmm2, (%rcx)
796 ; AVX512DQ-BW-NEXT: vpmovqw %zmm3, (%r8)
797 ; AVX512DQ-BW-NEXT: vzeroupper
798 ; AVX512DQ-BW-NEXT: retq
800 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf8:
801 ; AVX512DQ-BW-FCP: # %bb.0:
802 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
803 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $16, %zmm0, %zmm1
804 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
805 ; AVX512DQ-BW-FCP-NEXT: vpsrlq $48, %zmm0, %zmm3
806 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %zmm0, (%rsi)
807 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %zmm1, (%rdx)
808 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %zmm2, (%rcx)
809 ; AVX512DQ-BW-FCP-NEXT: vpmovqw %zmm3, (%r8)
810 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
811 ; AVX512DQ-BW-FCP-NEXT: retq
812 %wide.vec = load <32 x i16>, ptr %in.vec, align 64
813 %strided.vec0 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
814 %strided.vec1 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
815 %strided.vec2 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
816 %strided.vec3 = shufflevector <32 x i16> %wide.vec, <32 x i16> poison, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
817 store <8 x i16> %strided.vec0, ptr %out.vec0, align 64
818 store <8 x i16> %strided.vec1, ptr %out.vec1, align 64
819 store <8 x i16> %strided.vec2, ptr %out.vec2, align 64
820 store <8 x i16> %strided.vec3, ptr %out.vec3, align 64
824 define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
825 ; SSE-LABEL: load_i16_stride4_vf16:
827 ; SSE-NEXT: movdqa 96(%rdi), %xmm4
828 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
829 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
830 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
831 ; SSE-NEXT: movdqa 80(%rdi), %xmm3
832 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
833 ; SSE-NEXT: movdqa (%rdi), %xmm8
834 ; SSE-NEXT: movdqa 16(%rdi), %xmm10
835 ; SSE-NEXT: movdqa 32(%rdi), %xmm7
836 ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
837 ; SSE-NEXT: movdqa 48(%rdi), %xmm6
838 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,2,3]
839 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[0,2,2,3,4,5,6,7]
840 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm8[0,2,2,3]
841 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7]
842 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
843 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm6[0,2,2,3]
844 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm12[0,1,0,2,4,5,6,7]
845 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm7[0,2,2,3]
846 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm13[0,1,0,2,4,5,6,7]
847 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1]
848 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm2[0],xmm7[1]
849 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm3[0,2,2,3]
850 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,2,2,3,4,5,6,7]
851 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,2,2,3]
852 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7]
853 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
854 ; SSE-NEXT: movdqa 112(%rdi), %xmm11
855 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
856 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7]
857 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,2,2,3]
858 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[0,1,0,2,4,5,6,7]
859 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
860 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm0[0],xmm4[1]
861 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
862 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[1,3,2,3,4,5,6,7]
863 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
864 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,1,3,4,5,6,7]
865 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm13[0,1,1,3,4,5,6,7]
866 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
867 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
868 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[1,3,2,3,4,5,6,7]
869 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[1,3,2,3,4,5,6,7]
870 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
871 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7]
872 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,1,1,3,4,5,6,7]
873 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
874 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
875 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm10[3,1,2,3]
876 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7]
877 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[3,1,2,3]
878 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
879 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
880 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
881 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,1,2,0,4,5,6,7]
882 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
883 ; SSE-NEXT: # xmm10 = mem[3,1,2,3]
884 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[0,1,2,0,4,5,6,7]
885 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1]
886 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm3[0],xmm6[1]
887 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
888 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
889 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[2,0,2,3,4,5,6,7]
890 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
891 ; SSE-NEXT: # xmm12 = mem[3,1,2,3]
892 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm12[2,0,2,3,4,5,6,7]
893 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm8[0],xmm13[1],xmm8[1]
894 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[3,1,2,3]
895 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,2,0,4,5,6,7]
896 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
897 ; SSE-NEXT: # xmm14 = mem[3,1,2,3]
898 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm14[0,1,2,0,4,5,6,7]
899 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
900 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm13[0],xmm15[1]
901 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
902 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
903 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
904 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,3,1,4,5,6,7]
905 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,1,3,1,4,5,6,7]
906 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
907 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
908 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7]
909 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm12[3,1,2,3,4,5,6,7]
910 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
911 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,1,4,5,6,7]
912 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7]
913 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
914 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
915 ; SSE-NEXT: movapd %xmm4, 16(%rsi)
916 ; SSE-NEXT: movapd %xmm7, (%rsi)
917 ; SSE-NEXT: movapd %xmm9, 16(%rdx)
918 ; SSE-NEXT: movapd %xmm5, (%rdx)
919 ; SSE-NEXT: movapd %xmm15, 16(%rcx)
920 ; SSE-NEXT: movapd %xmm6, (%rcx)
921 ; SSE-NEXT: movapd %xmm3, 16(%r8)
922 ; SSE-NEXT: movapd %xmm1, (%r8)
925 ; AVX-LABEL: load_i16_stride4_vf16:
927 ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
928 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm5
929 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm5[0],xmm1[1,2,3],xmm5[4],xmm1[5,6,7]
930 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm6
931 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0],xmm1[1,2,3],xmm6[4],xmm1[5,6,7]
932 ; AVX-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
933 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm7
934 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm1[1,2,3],xmm7[4],xmm1[5,6,7]
935 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm8
936 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0],xmm1[1,2,3],xmm8[4],xmm1[5,6,7]
937 ; AVX-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
938 ; AVX-NEXT: vpackusdw %xmm0, %xmm2, %xmm0
939 ; AVX-NEXT: vmovdqa (%rdi), %xmm2
940 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm3
941 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm4
942 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm9
943 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm9[0],xmm1[1,2,3],xmm9[4],xmm1[5,6,7]
944 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm4[0],xmm1[1,2,3],xmm4[4],xmm1[5,6,7]
945 ; AVX-NEXT: vpackusdw %xmm10, %xmm11, %xmm10
946 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm3[0],xmm1[1,2,3],xmm3[4],xmm1[5,6,7]
947 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3],xmm2[4],xmm1[5,6,7]
948 ; AVX-NEXT: vpackusdw %xmm11, %xmm1, %xmm1
949 ; AVX-NEXT: vpackusdw %xmm10, %xmm1, %xmm1
950 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm5[0,2,2,3]
951 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
952 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm6[0,2,2,3]
953 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
954 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
955 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3]
956 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
957 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm8[0,2,2,3]
958 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
959 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
960 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
961 ; AVX-NEXT: vinsertf128 $1, %xmm10, %ymm0, %ymm10
962 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[0,2,2,3]
963 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
964 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
965 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7]
966 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
967 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm3[0,2,2,3]
968 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
969 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm2[0,2,2,3]
970 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[1,3,2,3,4,5,6,7]
971 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
972 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
973 ; AVX-NEXT: vblendps {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
974 ; AVX-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
975 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7]
976 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
977 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[0,1,2,0,4,5,6,7]
978 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
979 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
980 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm7[2,0,2,3,4,5,6,7]
981 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
982 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm8[2,0,2,3,4,5,6,7]
983 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
984 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3],xmm11[4,5,6,7]
985 ; AVX-NEXT: vinsertf128 $1, %xmm11, %ymm0, %ymm11
986 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[3,1,2,3]
987 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm9[0,1,2,0,4,5,6,7]
988 ; AVX-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
989 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm4[0,1,2,0,4,5,6,7]
990 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
991 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
992 ; AVX-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,0,2,3,4,5,6,7]
993 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
994 ; AVX-NEXT: vpshuflw {{.*#+}} xmm14 = xmm2[2,0,2,3,4,5,6,7]
995 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
996 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3],xmm12[4,5,6,7]
997 ; AVX-NEXT: vblendps {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7]
998 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
999 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
1000 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1001 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
1002 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
1003 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1004 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
1005 ; AVX-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm5
1006 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm9[0,1,3,1,4,5,6,7]
1007 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
1008 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
1009 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
1010 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
1011 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1012 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm4[4,5,6,7]
1013 ; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
1014 ; AVX-NEXT: vmovdqa %xmm1, (%rsi)
1015 ; AVX-NEXT: vmovdqa %xmm0, 16(%rsi)
1016 ; AVX-NEXT: vmovaps %ymm10, (%rdx)
1017 ; AVX-NEXT: vmovaps %ymm11, (%rcx)
1018 ; AVX-NEXT: vmovaps %ymm2, (%r8)
1019 ; AVX-NEXT: vzeroupper
1022 ; AVX2-LABEL: load_i16_stride4_vf16:
1024 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
1025 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1026 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
1027 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1028 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1029 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1030 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1031 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1032 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1033 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
1034 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1035 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
1036 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
1037 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1038 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
1039 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1040 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
1041 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1042 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
1043 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1044 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
1045 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
1046 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm3
1047 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm5
1048 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm6
1049 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
1050 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
1051 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm7
1052 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm7[0,2,2,3]
1053 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
1054 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
1055 ; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
1056 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm8
1057 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3]
1058 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
1059 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm10
1060 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,2,2,3]
1061 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
1062 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
1063 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1064 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3,4,5],ymm4[6,7]
1065 ; AVX2-NEXT: vpshufd {{.*#+}} xmm9 = xmm5[0,2,2,3]
1066 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
1067 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3]
1068 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
1069 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
1070 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[0,2,2,3]
1071 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
1072 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
1073 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
1074 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
1075 ; AVX2-NEXT: vpblendd {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3]
1076 ; AVX2-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
1077 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
1078 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[0,1,2,0,4,5,6,7]
1079 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
1080 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm7[0,1,2,0,4,5,6,7]
1081 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1]
1082 ; AVX2-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
1083 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
1084 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[2,0,2,3,4,5,6,7]
1085 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
1086 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm10[2,0,2,3,4,5,6,7]
1087 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
1088 ; AVX2-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1089 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
1090 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
1091 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm11 = xmm5[0,1,2,0,4,5,6,7]
1092 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
1093 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[0,1,2,0,4,5,6,7]
1094 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
1095 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
1096 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[2,0,2,3,4,5,6,7]
1097 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1098 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7]
1099 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
1100 ; AVX2-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
1101 ; AVX2-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3],ymm9[4,5,6,7]
1102 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
1103 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
1104 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1105 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
1106 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
1107 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[3,1,2,3,4,5,6,7]
1108 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
1109 ; AVX2-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1110 ; AVX2-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
1111 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
1112 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
1113 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
1114 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
1115 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1116 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1117 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3]
1118 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
1119 ; AVX2-NEXT: vmovdqa %ymm0, (%rsi)
1120 ; AVX2-NEXT: vmovdqa %ymm4, (%rdx)
1121 ; AVX2-NEXT: vmovdqa %ymm9, (%rcx)
1122 ; AVX2-NEXT: vmovdqa %ymm1, (%r8)
1123 ; AVX2-NEXT: vzeroupper
1126 ; AVX2-FP-LABEL: load_i16_stride4_vf16:
1128 ; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0
1129 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1130 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
1131 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
1132 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1133 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
1134 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1135 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
1136 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1137 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
1138 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
1139 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
1140 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
1141 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1142 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
1143 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1144 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
1145 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1146 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
1147 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
1148 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm1
1149 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm3
1150 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm4
1151 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm5
1152 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
1153 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm6
1154 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm6, %xmm7
1155 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm8
1156 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm8, %xmm9
1157 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
1158 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1159 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
1160 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm10
1161 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm10, %xmm11
1162 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm12
1163 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm12, %xmm13
1164 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
1165 ; AVX2-FP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
1166 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm11[0,1,2,3,4,5],ymm7[6,7]
1167 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm5, %xmm11
1168 ; AVX2-FP-NEXT: vpshufb %xmm2, %xmm4, %xmm2
1169 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1]
1170 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm3, %xmm11
1171 ; AVX2-FP-NEXT: vpshufb %xmm9, %xmm1, %xmm9
1172 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1]
1173 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm9[0,1],xmm2[2,3]
1174 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm7[4,5,6,7]
1175 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
1176 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm6[0,1,2,0,4,5,6,7]
1177 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
1178 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[0,1,2,0,4,5,6,7]
1179 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm9[0],xmm7[0],xmm9[1],xmm7[1]
1180 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
1181 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm10[3,1,2,3]
1182 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm9[2,0,2,3,4,5,6,7]
1183 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm12[3,1,2,3]
1184 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7]
1185 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
1186 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
1187 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3,4,5],ymm7[6,7]
1188 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[3,1,2,3]
1189 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm5[0,1,2,0,4,5,6,7]
1190 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
1191 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm4[0,1,2,0,4,5,6,7]
1192 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
1193 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
1194 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm3[2,0,2,3,4,5,6,7]
1195 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1196 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm1[2,0,2,3,4,5,6,7]
1197 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
1198 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3]
1199 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2,3],ymm7[4,5,6,7]
1200 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
1201 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7]
1202 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm8[0],xmm6[0],xmm8[1],xmm6[1]
1203 ; AVX2-FP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
1204 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7]
1205 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7]
1206 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
1207 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1208 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0,1,2,3,4,5],ymm6[6,7]
1209 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
1210 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
1211 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
1212 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
1213 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
1214 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1215 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3]
1216 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
1217 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rsi)
1218 ; AVX2-FP-NEXT: vmovdqa %ymm2, (%rdx)
1219 ; AVX2-FP-NEXT: vmovdqa %ymm7, (%rcx)
1220 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%r8)
1221 ; AVX2-FP-NEXT: vzeroupper
1222 ; AVX2-FP-NEXT: retq
1224 ; AVX2-FCP-LABEL: load_i16_stride4_vf16:
1225 ; AVX2-FCP: # %bb.0:
1226 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm1
1227 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
1228 ; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0
1229 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1230 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm4
1231 ; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
1232 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
1233 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm4
1234 ; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
1235 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
1236 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,2,2,3,0,2,4,6]
1237 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm4
1238 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
1239 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm6
1240 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm3, %ymm3
1241 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm7
1242 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
1243 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
1244 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6
1245 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm7
1246 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm8
1247 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm9
1248 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
1249 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11
1250 ; AVX2-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm10
1251 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1]
1252 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
1253 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm7, %xmm12
1254 ; AVX2-FCP-NEXT: vpshufb %xmm11, %xmm6, %xmm11
1255 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1]
1256 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3]
1257 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
1258 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm4, %ymm4
1259 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm3, %ymm3
1260 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6,7]
1261 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
1262 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [1,3,2,3,1,3,5,7]
1263 ; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2
1264 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm10
1265 ; AVX2-FCP-NEXT: vpermd %ymm1, %ymm4, %ymm1
1266 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm4
1267 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5],ymm10[6,7]
1268 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[3,1,2,3]
1269 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[0,1,2,0,4,5,6,7]
1270 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[3,1,2,3]
1271 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,2,0,4,5,6,7]
1272 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
1273 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
1274 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm7[2,0,2,3,4,5,6,7]
1275 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
1276 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm6[2,0,2,3,4,5,6,7]
1277 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm12[0],xmm10[0],xmm12[1],xmm10[1]
1278 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3]
1279 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1,2,3],ymm4[4,5,6,7]
1280 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm2, %ymm2
1281 ; AVX2-FCP-NEXT: vpshufb %ymm11, %ymm1, %ymm1
1282 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
1283 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,3,1,4,5,6,7]
1284 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm8[0,1,3,1,4,5,6,7]
1285 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1286 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7]
1287 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
1288 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
1289 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
1290 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
1291 ; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1292 ; AVX2-FCP-NEXT: vmovdqa %ymm3, (%rdx)
1293 ; AVX2-FCP-NEXT: vmovdqa %ymm4, (%rcx)
1294 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8)
1295 ; AVX2-FCP-NEXT: vzeroupper
1296 ; AVX2-FCP-NEXT: retq
1298 ; AVX512-LABEL: load_i16_stride4_vf16:
1300 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0
1301 ; AVX512-NEXT: vpmovqw %ymm0, %xmm0
1302 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1303 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm1
1304 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
1305 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
1306 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm4
1307 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
1308 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7]
1309 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1310 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
1311 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
1312 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
1313 ; AVX512-NEXT: vpmovqw %zmm3, %xmm6
1314 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
1315 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm6
1316 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm7
1317 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
1318 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
1319 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1320 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
1321 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1322 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
1323 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
1324 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
1325 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1326 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1327 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
1328 ; AVX512-NEXT: vpsrlq $16, %zmm3, %zmm5
1329 ; AVX512-NEXT: vpmovqw %zmm5, %xmm5
1330 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
1331 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1332 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7]
1333 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
1334 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
1335 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1336 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1337 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
1338 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
1339 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
1340 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
1341 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
1342 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1343 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
1344 ; AVX512-NEXT: vpsrlq $32, %zmm3, %zmm8
1345 ; AVX512-NEXT: vpmovqw %zmm8, %xmm8
1346 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
1347 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
1348 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
1349 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1350 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1351 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
1352 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
1353 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
1354 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
1355 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
1356 ; AVX512-NEXT: vpsrlq $48, %zmm3, %zmm3
1357 ; AVX512-NEXT: vpmovqw %zmm3, %xmm3
1358 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
1359 ; AVX512-NEXT: vmovdqa %ymm0, (%rsi)
1360 ; AVX512-NEXT: vmovdqa %ymm2, (%rdx)
1361 ; AVX512-NEXT: vmovdqa %ymm5, (%rcx)
1362 ; AVX512-NEXT: vmovdqa %ymm1, (%r8)
1363 ; AVX512-NEXT: vzeroupper
1366 ; AVX512-FCP-LABEL: load_i16_stride4_vf16:
1367 ; AVX512-FCP: # %bb.0:
1368 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
1369 ; AVX512-FCP-NEXT: # ymm0 = mem[0,1,0,1]
1370 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
1371 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
1372 ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3
1373 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm4
1374 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
1375 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
1376 ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1
1377 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7
1378 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14]
1379 ; AVX512-FCP-NEXT: vpermt2d %ymm4, %ymm8, %ymm7
1380 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
1381 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm9
1382 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
1383 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
1384 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
1385 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
1386 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
1387 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3
1388 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
1389 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
1390 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
1391 ; AVX512-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
1392 ; AVX512-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
1393 ; AVX512-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3
1394 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
1395 ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm8, %ymm5
1396 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
1397 ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0
1398 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
1399 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
1400 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
1401 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
1402 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm4, %zmm3
1403 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
1404 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
1405 ; AVX512-FCP-NEXT: vmovdqa %ymm7, (%rsi)
1406 ; AVX512-FCP-NEXT: vmovdqa %ymm1, (%rdx)
1407 ; AVX512-FCP-NEXT: vmovdqa %ymm0, (%rcx)
1408 ; AVX512-FCP-NEXT: vmovdqa %ymm2, (%r8)
1409 ; AVX512-FCP-NEXT: vzeroupper
1410 ; AVX512-FCP-NEXT: retq
1412 ; AVX512DQ-LABEL: load_i16_stride4_vf16:
1413 ; AVX512DQ: # %bb.0:
1414 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0
1415 ; AVX512DQ-NEXT: vpmovqw %ymm0, %xmm0
1416 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
1417 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm1
1418 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,2,3]
1419 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,0,2,4,5,6,7]
1420 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm4
1421 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,2,2,3]
1422 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[0,1,0,2,4,5,6,7]
1423 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
1424 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
1425 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
1426 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm3
1427 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm6
1428 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm0[4,5,6,7]
1429 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm6
1430 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm7
1431 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
1432 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
1433 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
1434 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
1435 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm7[0,2,2,3]
1436 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
1437 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm6[0,2,2,3]
1438 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
1439 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1440 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1441 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3,4,5],ymm2[6,7]
1442 ; AVX512DQ-NEXT: vpsrlq $16, %zmm3, %zmm5
1443 ; AVX512DQ-NEXT: vpmovqw %zmm5, %xmm5
1444 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1,2,3],ymm2[4,5,6,7]
1445 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
1446 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm1[0,1,2,0,4,5,6,7]
1447 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
1448 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
1449 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm8[0],xmm5[0],xmm8[1],xmm5[1]
1450 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
1451 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
1452 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm7[2,0,2,3,4,5,6,7]
1453 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
1454 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm6[2,0,2,3,4,5,6,7]
1455 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
1456 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
1457 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
1458 ; AVX512DQ-NEXT: vpsrlq $32, %zmm3, %zmm8
1459 ; AVX512DQ-NEXT: vpmovqw %zmm8, %xmm8
1460 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3],ymm5[4,5,6,7]
1461 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
1462 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
1463 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
1464 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
1465 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm7[3,1,2,3,4,5,6,7]
1466 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
1467 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
1468 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
1469 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
1470 ; AVX512DQ-NEXT: vpsrlq $48, %zmm3, %zmm3
1471 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
1472 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
1473 ; AVX512DQ-NEXT: vmovdqa %ymm0, (%rsi)
1474 ; AVX512DQ-NEXT: vmovdqa %ymm2, (%rdx)
1475 ; AVX512DQ-NEXT: vmovdqa %ymm5, (%rcx)
1476 ; AVX512DQ-NEXT: vmovdqa %ymm1, (%r8)
1477 ; AVX512DQ-NEXT: vzeroupper
1478 ; AVX512DQ-NEXT: retq
1480 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf16:
1481 ; AVX512DQ-FCP: # %bb.0:
1482 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
1483 ; AVX512DQ-FCP-NEXT: # ymm0 = mem[0,1,0,1]
1484 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [0,2,2,3,0,2,4,6]
1485 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm2
1486 ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm1, %ymm3
1487 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm3, %ymm4
1488 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
1489 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm6
1490 ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm1, %ymm1
1491 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm7
1492 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,2,2,3,4,6,12,14]
1493 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm4, %ymm8, %ymm7
1494 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm4
1495 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm9
1496 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
1497 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
1498 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
1499 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
1500 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
1501 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm3
1502 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
1503 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
1504 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [1,3,2,3,1,3,5,7]
1505 ; AVX512DQ-FCP-NEXT: vpermd %ymm2, %ymm3, %ymm2
1506 ; AVX512DQ-FCP-NEXT: vpshufb %ymm0, %ymm2, %ymm0
1507 ; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm3, %ymm3
1508 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm5
1509 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm8, %ymm5
1510 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
1511 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0
1512 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm5[4,5,6,7]
1513 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm2, %ymm2
1514 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
1515 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
1516 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm4, %zmm3
1517 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
1518 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
1519 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm7, (%rsi)
1520 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm1, (%rdx)
1521 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm0, (%rcx)
1522 ; AVX512DQ-FCP-NEXT: vmovdqa %ymm2, (%r8)
1523 ; AVX512DQ-FCP-NEXT: vzeroupper
1524 ; AVX512DQ-FCP-NEXT: retq
1526 ; AVX512BW-LABEL: load_i16_stride4_vf16:
1527 ; AVX512BW: # %bb.0:
1528 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
1529 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1
1530 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm2
1531 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1532 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
1533 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1534 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
1535 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1536 ; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
1537 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1538 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi)
1539 ; AVX512BW-NEXT: vmovdqa %ymm3, (%rdx)
1540 ; AVX512BW-NEXT: vmovdqa %ymm4, (%rcx)
1541 ; AVX512BW-NEXT: vmovdqa %ymm5, (%r8)
1542 ; AVX512BW-NEXT: vzeroupper
1543 ; AVX512BW-NEXT: retq
1545 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf16:
1546 ; AVX512BW-FCP: # %bb.0:
1547 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
1548 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
1549 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
1550 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1551 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
1552 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1553 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
1554 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1555 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
1556 ; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1557 ; AVX512BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1558 ; AVX512BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
1559 ; AVX512BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
1560 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
1561 ; AVX512BW-FCP-NEXT: vzeroupper
1562 ; AVX512BW-FCP-NEXT: retq
1564 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf16:
1565 ; AVX512DQ-BW: # %bb.0:
1566 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
1567 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm1
1568 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm2
1569 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1570 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
1571 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1572 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
1573 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1574 ; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
1575 ; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1576 ; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rsi)
1577 ; AVX512DQ-BW-NEXT: vmovdqa %ymm3, (%rdx)
1578 ; AVX512DQ-BW-NEXT: vmovdqa %ymm4, (%rcx)
1579 ; AVX512DQ-BW-NEXT: vmovdqa %ymm5, (%r8)
1580 ; AVX512DQ-BW-NEXT: vzeroupper
1581 ; AVX512DQ-BW-NEXT: retq
1583 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf16:
1584 ; AVX512DQ-BW-FCP: # %bb.0:
1585 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
1586 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm1
1587 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2
1588 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm0
1589 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
1590 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm3
1591 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
1592 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm4
1593 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
1594 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm1, %zmm5
1595 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm0, (%rsi)
1596 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm3, (%rdx)
1597 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm4, (%rcx)
1598 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, (%r8)
1599 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
1600 ; AVX512DQ-BW-FCP-NEXT: retq
1601 %wide.vec = load <64 x i16>, ptr %in.vec, align 64
1602 %strided.vec0 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
1603 %strided.vec1 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
1604 %strided.vec2 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <16 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62>
1605 %strided.vec3 = shufflevector <64 x i16> %wide.vec, <64 x i16> poison, <16 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
1606 store <16 x i16> %strided.vec0, ptr %out.vec0, align 64
1607 store <16 x i16> %strided.vec1, ptr %out.vec1, align 64
1608 store <16 x i16> %strided.vec2, ptr %out.vec2, align 64
1609 store <16 x i16> %strided.vec3, ptr %out.vec3, align 64
1613 define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
1614 ; SSE-LABEL: load_i16_stride4_vf32:
1616 ; SSE-NEXT: subq $248, %rsp
1617 ; SSE-NEXT: movdqa 160(%rdi), %xmm3
1618 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1619 ; SSE-NEXT: movdqa 128(%rdi), %xmm4
1620 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1621 ; SSE-NEXT: movdqa 144(%rdi), %xmm5
1622 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1623 ; SSE-NEXT: movdqa 96(%rdi), %xmm2
1624 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1625 ; SSE-NEXT: movdqa 112(%rdi), %xmm6
1626 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1627 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
1628 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1629 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
1630 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1631 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1632 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1633 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1634 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1635 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
1636 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
1637 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1638 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
1639 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1640 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
1641 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm2[0,2,2,3]
1642 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,1,0,2,4,5,6,7]
1643 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1644 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1645 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1646 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm5[0,2,2,3]
1647 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,2,2,3,4,5,6,7]
1648 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm4[0,2,2,3]
1649 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,2,2,3,4,5,6,7]
1650 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1651 ; SSE-NEXT: movdqa 176(%rdi), %xmm0
1652 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1653 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm0[0,2,2,3]
1654 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,0,2,4,5,6,7]
1655 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3]
1656 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm9[0,1,0,2,4,5,6,7]
1657 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1658 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1659 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1660 ; SSE-NEXT: movdqa (%rdi), %xmm1
1661 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1662 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
1663 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1664 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm0[0,2,2,3]
1665 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,2,2,3,4,5,6,7]
1666 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
1667 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7]
1668 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1669 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
1670 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1671 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
1672 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1673 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
1674 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
1675 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
1676 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,0,2,4,5,6,7]
1677 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1678 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1679 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1680 ; SSE-NEXT: movdqa 192(%rdi), %xmm0
1681 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1682 ; SSE-NEXT: movdqa 208(%rdi), %xmm1
1683 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1684 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
1685 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
1686 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
1687 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
1688 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1689 ; SSE-NEXT: movdqa 224(%rdi), %xmm2
1690 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1691 ; SSE-NEXT: movdqa 240(%rdi), %xmm1
1692 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1693 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
1694 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7]
1695 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1696 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm2[0,1,0,2,4,5,6,7]
1697 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
1698 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm0[0],xmm14[1]
1699 ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1700 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1701 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
1702 ; SSE-NEXT: pshuflw $237, (%rsp), %xmm1 # 16-byte Folded Reload
1703 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
1704 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1705 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1706 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
1707 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7]
1708 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
1709 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
1710 ; SSE-NEXT: movapd %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1711 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[1,3,2,3,4,5,6,7]
1712 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[1,3,2,3,4,5,6,7]
1713 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1714 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[0,1,1,3,4,5,6,7]
1715 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
1716 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
1717 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm1[0],xmm9[1]
1718 ; SSE-NEXT: movapd %xmm9, (%rsp) # 16-byte Spill
1719 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,3,2,3,4,5,6,7]
1720 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[1,3,2,3,4,5,6,7]
1721 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1722 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7]
1723 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm6[0,1,1,3,4,5,6,7]
1724 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
1725 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm1[0],xmm13[1]
1726 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
1727 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7]
1728 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1729 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7]
1730 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,1,1,3,4,5,6,7]
1731 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
1732 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
1733 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1734 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
1735 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1736 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1737 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
1738 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1739 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
1740 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
1741 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1742 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1743 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
1744 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1745 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
1746 ; SSE-NEXT: # xmm9 = mem[3,1,2,3]
1747 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
1748 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7]
1749 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
1750 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm1[0],xmm5[1]
1751 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1752 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
1753 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1754 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1755 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
1756 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1757 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
1758 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
1759 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1760 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1761 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
1762 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1763 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
1764 ; SSE-NEXT: # xmm14 = mem[3,1,2,3]
1765 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
1766 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm14[0,1,2,0,4,5,6,7]
1767 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
1768 ; SSE-NEXT: movsd {{.*#+}} xmm11 = xmm1[0],xmm11[1]
1769 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1770 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
1771 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1772 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1773 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
1774 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1775 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
1776 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,0,2,3,4,5,6,7]
1777 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
1778 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
1779 ; SSE-NEXT: # xmm12 = mem[3,1,2,3]
1780 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
1781 ; SSE-NEXT: # xmm10 = mem[3,1,2,3]
1782 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7]
1783 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm10[0,1,2,0,4,5,6,7]
1784 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
1785 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
1786 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
1787 ; SSE-NEXT: # xmm8 = mem[3,1,2,3]
1788 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
1789 ; SSE-NEXT: # xmm7 = mem[3,1,2,3]
1790 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7]
1791 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
1792 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1793 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
1794 ; SSE-NEXT: # xmm6 = mem[3,1,2,3]
1795 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
1796 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
1797 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7]
1798 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,2,0,4,5,6,7]
1799 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1800 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1801 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1802 ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
1803 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1804 ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
1805 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1806 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1807 ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
1808 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,3,1,4,5,6,7]
1809 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
1810 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1]
1811 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1812 ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
1813 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1814 ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
1815 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1816 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1817 ; SSE-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
1818 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,1,3,1,4,5,6,7]
1819 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
1820 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1]
1821 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1822 ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
1823 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
1824 ; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
1825 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1826 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,1,3,1,4,5,6,7]
1827 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7]
1828 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
1829 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1]
1830 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[3,1,2,3,4,5,6,7]
1831 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7]
1832 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
1833 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[0,1,3,1,4,5,6,7]
1834 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
1835 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
1836 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm2[0],xmm3[1]
1837 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1838 ; SSE-NEXT: movaps %xmm0, 48(%rsi)
1839 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1840 ; SSE-NEXT: movaps %xmm0, (%rsi)
1841 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1842 ; SSE-NEXT: movaps %xmm0, 32(%rsi)
1843 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1844 ; SSE-NEXT: movaps %xmm0, 16(%rsi)
1845 ; SSE-NEXT: movapd %xmm15, 48(%rdx)
1846 ; SSE-NEXT: movapd %xmm13, (%rdx)
1847 ; SSE-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
1848 ; SSE-NEXT: movaps %xmm0, 32(%rdx)
1849 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1850 ; SSE-NEXT: movaps %xmm0, 16(%rdx)
1851 ; SSE-NEXT: movapd %xmm1, 48(%rcx)
1852 ; SSE-NEXT: movapd %xmm4, 32(%rcx)
1853 ; SSE-NEXT: movapd %xmm11, 16(%rcx)
1854 ; SSE-NEXT: movapd %xmm5, (%rcx)
1855 ; SSE-NEXT: movapd %xmm3, 48(%r8)
1856 ; SSE-NEXT: movapd %xmm10, 32(%r8)
1857 ; SSE-NEXT: movapd %xmm14, 16(%r8)
1858 ; SSE-NEXT: movapd %xmm9, (%r8)
1859 ; SSE-NEXT: addq $248, %rsp
1862 ; AVX-LABEL: load_i16_stride4_vf32:
1864 ; AVX-NEXT: subq $280, %rsp # imm = 0x118
1865 ; AVX-NEXT: vpxor %xmm6, %xmm6, %xmm6
1866 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm4
1867 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0],xmm6[1,2,3],xmm4[4],xmm6[5,6,7]
1868 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1869 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm11
1870 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0],xmm6[1,2,3],xmm11[4],xmm6[5,6,7]
1871 ; AVX-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
1872 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm3
1873 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm3[0],xmm6[1,2,3],xmm3[4],xmm6[5,6,7]
1874 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1875 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm0
1876 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1877 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
1878 ; AVX-NEXT: vpackusdw %xmm5, %xmm7, %xmm5
1879 ; AVX-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
1880 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1881 ; AVX-NEXT: vmovdqa (%rdi), %xmm10
1882 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm0
1883 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1
1884 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2
1885 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1886 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm2[0],xmm6[1,2,3],xmm2[4],xmm6[5,6,7]
1887 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm1[0],xmm6[1,2,3],xmm1[4],xmm6[5,6,7]
1888 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1889 ; AVX-NEXT: vpackusdw %xmm7, %xmm8, %xmm7
1890 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
1891 ; AVX-NEXT: vmovdqa %xmm0, %xmm2
1892 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1893 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm6[1,2,3],xmm10[4],xmm6[5,6,7]
1894 ; AVX-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1895 ; AVX-NEXT: vpackusdw %xmm8, %xmm9, %xmm8
1896 ; AVX-NEXT: vpackusdw %xmm7, %xmm8, %xmm7
1897 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1898 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
1899 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1900 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
1901 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm12
1902 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm12[0],xmm6[1,2,3],xmm12[4],xmm6[5,6,7]
1903 ; AVX-NEXT: vmovdqa %xmm12, (%rsp) # 16-byte Spill
1904 ; AVX-NEXT: vpackusdw %xmm7, %xmm8, %xmm7
1905 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm5
1906 ; AVX-NEXT: vpblendw {{.*#+}} xmm8 = xmm5[0],xmm6[1,2,3],xmm5[4],xmm6[5,6,7]
1907 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1908 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm0
1909 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm6[1,2,3],xmm0[4],xmm6[5,6,7]
1910 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1911 ; AVX-NEXT: vpackusdw %xmm8, %xmm9, %xmm8
1912 ; AVX-NEXT: vpackusdw %xmm7, %xmm8, %xmm7
1913 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1914 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm14
1915 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm14[0],xmm6[1,2,3],xmm14[4],xmm6[5,6,7]
1916 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1917 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm13
1918 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0],xmm6[1,2,3],xmm13[4],xmm6[5,6,7]
1919 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1920 ; AVX-NEXT: vpackusdw %xmm7, %xmm15, %xmm7
1921 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm8
1922 ; AVX-NEXT: vpblendw {{.*#+}} xmm15 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7]
1923 ; AVX-NEXT: vmovdqa %xmm8, %xmm9
1924 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1925 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm8
1926 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1927 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0],xmm6[1,2,3],xmm8[4],xmm6[5,6,7]
1928 ; AVX-NEXT: vpackusdw %xmm15, %xmm6, %xmm6
1929 ; AVX-NEXT: vpackusdw %xmm7, %xmm6, %xmm6
1930 ; AVX-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1931 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm4[0,2,2,3]
1932 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
1933 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
1934 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
1935 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1936 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[0,2,2,3]
1937 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
1938 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
1939 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm3[0,2,2,3]
1940 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
1941 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1]
1942 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
1943 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
1944 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3]
1945 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
1946 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,2,2,3]
1947 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[0,1,1,3,4,5,6,7]
1948 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm15[0],xmm7[0],xmm15[1],xmm7[1]
1949 ; AVX-NEXT: vpshufd {{.*#+}} xmm15 = xmm2[0,2,2,3]
1950 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
1951 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm10[0,2,2,3]
1952 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
1953 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1]
1954 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7]
1955 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1956 ; AVX-NEXT: vblendps {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1957 ; AVX-NEXT: vmovups %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1958 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
1959 ; AVX-NEXT: vpshufd {{.*#+}} xmm6 = xmm10[0,2,2,3]
1960 ; AVX-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
1961 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm12[0,2,2,3]
1962 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
1963 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
1964 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[0,2,2,3]
1965 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
1966 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[0,2,2,3]
1967 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
1968 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
1969 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1,2,3],xmm6[4,5,6,7]
1970 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm14[0,2,2,3]
1971 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
1972 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm13[0,2,2,3]
1973 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
1974 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
1975 ; AVX-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[0,2,2,3]
1976 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
1977 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
1978 ; AVX-NEXT: # xmm15 = mem[0,2,2,3]
1979 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
1980 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm15[0],xmm8[0],xmm15[1],xmm8[1]
1981 ; AVX-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4,5,6,7]
1982 ; AVX-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
1983 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm6[4,5,6,7]
1984 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
1985 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1986 ; AVX-NEXT: # xmm0 = mem[3,1,2,3]
1987 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1988 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3]
1989 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1990 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
1991 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
1992 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1993 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
1994 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
1995 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1996 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
1997 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1998 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
1999 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
2000 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2001 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2002 ; AVX-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[3,1,2,3]
2003 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2004 ; AVX-NEXT: # xmm6 = mem[3,1,2,3]
2005 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7]
2006 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7]
2007 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2008 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
2009 ; AVX-NEXT: # xmm12 = mem[3,1,2,3]
2010 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2011 ; AVX-NEXT: # xmm5 = mem[3,1,2,3]
2012 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm12[2,0,2,3,4,5,6,7]
2013 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm5[2,0,2,3,4,5,6,7]
2014 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2015 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm8[0,1,2,3],xmm1[4,5,6,7]
2016 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2017 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2018 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2019 ; AVX-NEXT: vpshufd {{.*#+}} xmm13 = xmm10[3,1,2,3]
2020 ; AVX-NEXT: vpshufd $231, (%rsp), %xmm14 # 16-byte Folded Reload
2021 ; AVX-NEXT: # xmm14 = mem[3,1,2,3]
2022 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7]
2023 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[0,1,2,0,4,5,6,7]
2024 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2025 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2026 ; AVX-NEXT: # xmm10 = mem[3,1,2,3]
2027 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
2028 ; AVX-NEXT: # xmm11 = mem[3,1,2,3]
2029 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[2,0,2,3,4,5,6,7]
2030 ; AVX-NEXT: vpshuflw {{.*#+}} xmm8 = xmm11[2,0,2,3,4,5,6,7]
2031 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
2032 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2033 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
2034 ; AVX-NEXT: # xmm8 = mem[3,1,2,3]
2035 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2036 ; AVX-NEXT: # xmm9 = mem[3,1,2,3]
2037 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7]
2038 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,2,0,4,5,6,7]
2039 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2040 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2041 ; AVX-NEXT: # xmm4 = mem[3,1,2,3]
2042 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2043 ; AVX-NEXT: # xmm2 = mem[3,1,2,3]
2044 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7]
2045 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7]
2046 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
2047 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
2048 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
2049 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2050 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2051 ; AVX-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
2052 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2053 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
2054 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2055 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2056 ; AVX-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
2057 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2058 ; AVX-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
2059 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2060 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
2061 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,3,1,4,5,6,7]
2062 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7]
2063 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2064 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
2065 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
2066 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2067 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
2068 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
2069 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2070 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7]
2071 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7]
2072 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
2073 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[3,1,2,3,4,5,6,7]
2074 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7]
2075 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2076 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
2077 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[0,1,3,1,4,5,6,7]
2078 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,3,1,4,5,6,7]
2079 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2080 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
2081 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
2082 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
2083 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5,6,7]
2084 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
2085 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2086 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2087 ; AVX-NEXT: vmovaps %xmm2, 32(%rsi)
2088 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2089 ; AVX-NEXT: vmovaps %xmm2, 48(%rsi)
2090 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2091 ; AVX-NEXT: vmovaps %xmm2, (%rsi)
2092 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2093 ; AVX-NEXT: vmovaps %xmm2, 16(%rsi)
2094 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2095 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
2096 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2097 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
2098 ; AVX-NEXT: vmovaps %ymm15, 32(%rcx)
2099 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2100 ; AVX-NEXT: vmovaps %ymm2, (%rcx)
2101 ; AVX-NEXT: vmovaps %ymm1, 32(%r8)
2102 ; AVX-NEXT: vmovaps %ymm0, (%r8)
2103 ; AVX-NEXT: addq $280, %rsp # imm = 0x118
2104 ; AVX-NEXT: vzeroupper
2107 ; AVX2-LABEL: load_i16_stride4_vf32:
2109 ; AVX2-NEXT: subq $168, %rsp
2110 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
2111 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2112 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2113 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2114 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2115 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2116 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2117 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2118 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2119 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2120 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2121 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
2122 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2123 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2124 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
2125 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
2126 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
2127 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2128 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2129 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2130 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2131 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2132 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
2133 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2134 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2135 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2136 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2137 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2138 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2139 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
2140 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2141 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
2142 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2143 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2144 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
2145 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
2146 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
2147 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2148 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
2149 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2150 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2151 ; AVX2-NEXT: vmovdqa (%rdi), %xmm1
2152 ; AVX2-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill
2153 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm5
2154 ; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2155 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm6
2156 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2157 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm7
2158 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm9
2159 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[0,2,2,3]
2160 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
2161 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm13
2162 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[0,2,2,3]
2163 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
2164 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2165 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2166 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm14
2167 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[0,2,2,3]
2168 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,3,2,3,4,5,6,7]
2169 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
2170 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[0,2,2,3]
2171 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
2172 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2173 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2174 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2175 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[0,2,2,3]
2176 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
2177 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm6[0,2,2,3]
2178 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7]
2179 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2180 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,2,2,3]
2181 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
2182 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,2,2,3]
2183 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7]
2184 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
2185 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2186 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
2187 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2188 ; AVX2-NEXT: vmovdqa 240(%rdi), %xmm0
2189 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2190 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2191 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
2192 ; AVX2-NEXT: vmovdqa 224(%rdi), %xmm10
2193 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm10[0,2,2,3]
2194 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
2195 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
2196 ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm11
2197 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[0,2,2,3]
2198 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
2199 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm15
2200 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm15[0,2,2,3]
2201 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
2202 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2203 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2204 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2205 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
2206 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm12
2207 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
2208 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
2209 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm4
2210 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
2211 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
2212 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
2213 ; AVX2-NEXT: vmovdqa 144(%rdi), %xmm3
2214 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3]
2215 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
2216 ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm6
2217 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2218 ; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
2219 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
2220 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
2221 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm5[0,1],xmm1[2,3]
2222 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2223 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2224 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm9[3,1,2,3]
2225 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2226 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
2227 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2228 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
2229 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
2230 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2231 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[3,1,2,3]
2232 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2233 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
2234 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2235 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
2236 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7]
2237 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
2238 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2239 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2240 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2241 ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3]
2242 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2243 ; AVX2-NEXT: # xmm9 = mem[3,1,2,3]
2244 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,2,0,4,5,6,7]
2245 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm9[0,1,2,0,4,5,6,7]
2246 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
2247 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
2248 ; AVX2-NEXT: # xmm7 = mem[3,1,2,3]
2249 ; AVX2-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload
2250 ; AVX2-NEXT: # xmm6 = mem[3,1,2,3]
2251 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
2252 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[2,0,2,3,4,5,6,7]
2253 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
2254 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2255 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
2256 ; AVX2-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
2257 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
2258 ; AVX2-NEXT: # xmm8 = mem[3,1,2,3]
2259 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
2260 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[0,1,2,0,4,5,6,7]
2261 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
2262 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2263 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
2264 ; AVX2-NEXT: vpshufd {{.*#+}} xmm15 = xmm15[3,1,2,3]
2265 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm11[2,0,2,3,4,5,6,7]
2266 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[2,0,2,3,4,5,6,7]
2267 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2268 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2269 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2270 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2271 ; AVX2-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
2272 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[3,1,2,3]
2273 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm12[0,1,2,0,4,5,6,7]
2274 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7]
2275 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2276 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[3,1,2,3]
2277 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
2278 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3]
2279 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7]
2280 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm3[2,0,2,3,4,5,6,7]
2281 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
2282 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
2283 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2284 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2285 ; AVX2-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
2286 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2287 ; AVX2-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
2288 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2289 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2290 ; AVX2-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
2291 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
2292 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
2293 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2294 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2295 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2296 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2297 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[0,1,3,1,4,5,6,7]
2298 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,3,1,4,5,6,7]
2299 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2300 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7]
2301 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
2302 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
2303 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2304 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2305 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,3,1,4,5,6,7]
2306 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7]
2307 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2308 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm11[3,1,2,3,4,5,6,7]
2309 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm15[3,1,2,3,4,5,6,7]
2310 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
2311 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2312 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2313 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
2314 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[0,1,3,1,4,5,6,7]
2315 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
2316 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
2317 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
2318 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
2319 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
2320 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
2321 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
2322 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2323 ; AVX2-NEXT: vmovaps %ymm2, 32(%rsi)
2324 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2325 ; AVX2-NEXT: vmovaps %ymm2, (%rsi)
2326 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2327 ; AVX2-NEXT: vmovaps %ymm2, 32(%rdx)
2328 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
2329 ; AVX2-NEXT: vmovaps %ymm2, (%rdx)
2330 ; AVX2-NEXT: vmovdqa %ymm14, 32(%rcx)
2331 ; AVX2-NEXT: vmovups (%rsp), %ymm2 # 32-byte Reload
2332 ; AVX2-NEXT: vmovaps %ymm2, (%rcx)
2333 ; AVX2-NEXT: vmovdqa %ymm1, 32(%r8)
2334 ; AVX2-NEXT: vmovdqa %ymm0, (%r8)
2335 ; AVX2-NEXT: addq $168, %rsp
2336 ; AVX2-NEXT: vzeroupper
2339 ; AVX2-FP-LABEL: load_i16_stride4_vf32:
2341 ; AVX2-FP-NEXT: subq $184, %rsp
2342 ; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0
2343 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2344 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
2345 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2346 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2347 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
2348 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2349 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2350 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2351 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
2352 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2353 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
2354 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2355 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2356 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
2357 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
2358 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
2359 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2360 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2361 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
2362 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2363 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2364 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
2365 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2366 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2367 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
2368 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2369 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2370 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2371 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
2372 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
2373 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
2374 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2375 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2376 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
2377 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
2378 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
2379 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2380 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
2381 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
2382 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2383 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm13
2384 ; AVX2-FP-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2385 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm12
2386 ; AVX2-FP-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2387 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm14
2388 ; AVX2-FP-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2389 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm7
2390 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
2391 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm2
2392 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm2, %xmm0
2393 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm4
2394 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm4, %xmm1
2395 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2396 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3
2397 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
2398 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm5
2399 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm5, %xmm8
2400 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm6
2401 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm6, %xmm9
2402 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2403 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
2404 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6,7]
2405 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm7, %xmm8
2406 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm9
2407 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2408 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm12, %xmm9
2409 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm13, %xmm12
2410 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
2411 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
2412 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm3[4,5,6,7]
2413 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2414 ; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm14
2415 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm14, %xmm9
2416 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm8
2417 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm8, %xmm12
2418 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
2419 ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm0
2420 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2421 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm0, %xmm12
2422 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm15
2423 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm15, %xmm13
2424 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1]
2425 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2426 ; AVX2-FP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
2427 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm12[0,1,2,3,4,5],ymm9[6,7]
2428 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm0
2429 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2430 ; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm13
2431 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm13, %xmm12
2432 ; AVX2-FP-NEXT: vpshufb %xmm10, %xmm0, %xmm10
2433 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1]
2434 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm3
2435 ; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
2436 ; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm12
2437 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm12, %xmm0
2438 ; AVX2-FP-NEXT: vpshufb %xmm11, %xmm3, %xmm11
2439 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
2440 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm10[2,3]
2441 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2442 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2443 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
2444 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2445 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3]
2446 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2447 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
2448 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
2449 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2450 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[3,1,2,3]
2451 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2452 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm6[3,1,2,3]
2453 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2454 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
2455 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
2456 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2457 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2458 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2459 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2460 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
2461 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
2462 ; AVX2-FP-NEXT: # xmm6 = mem[3,1,2,3]
2463 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[0,1,2,0,4,5,6,7]
2464 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[0,1,2,0,4,5,6,7]
2465 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
2466 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
2467 ; AVX2-FP-NEXT: # xmm5 = mem[3,1,2,3]
2468 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
2469 ; AVX2-FP-NEXT: # xmm4 = mem[3,1,2,3]
2470 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[2,0,2,3,4,5,6,7]
2471 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm4[2,0,2,3,4,5,6,7]
2472 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
2473 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
2474 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2475 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2476 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
2477 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3]
2478 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm2[0,1,2,0,4,5,6,7]
2479 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm3[0,1,2,0,4,5,6,7]
2480 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
2481 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
2482 ; AVX2-FP-NEXT: # xmm8 = mem[3,1,2,3]
2483 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm14 = xmm15[3,1,2,3]
2484 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm8[2,0,2,3,4,5,6,7]
2485 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm14[2,0,2,3,4,5,6,7]
2486 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
2487 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
2488 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2489 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2,3,4,5],ymm0[6,7]
2490 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
2491 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
2492 ; AVX2-FP-NEXT: # xmm9 = mem[3,1,2,3]
2493 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm13[0,1,2,0,4,5,6,7]
2494 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm9[0,1,2,0,4,5,6,7]
2495 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1]
2496 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm15 = xmm12[3,1,2,3]
2497 ; AVX2-FP-NEXT: vpshufd $231, (%rsp), %xmm12 # 16-byte Folded Reload
2498 ; AVX2-FP-NEXT: # xmm12 = mem[3,1,2,3]
2499 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm15[2,0,2,3,4,5,6,7]
2500 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm10 = xmm12[2,0,2,3,4,5,6,7]
2501 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
2502 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm11[2,3]
2503 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2504 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2505 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
2506 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2507 ; AVX2-FP-NEXT: # xmm10 = mem[0,1,3,1,4,5,6,7]
2508 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm10[0],xmm1[0],xmm10[1],xmm1[1]
2509 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
2510 ; AVX2-FP-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7]
2511 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
2512 ; AVX2-FP-NEXT: # xmm11 = mem[3,1,2,3,4,5,6,7]
2513 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
2514 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
2515 ; AVX2-FP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10
2516 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm1[6,7]
2517 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
2518 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
2519 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
2520 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
2521 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
2522 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
2523 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,3]
2524 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
2525 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
2526 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
2527 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2528 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7]
2529 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm14[3,1,2,3,4,5,6,7]
2530 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2531 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2532 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2533 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2534 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[0,1,3,1,4,5,6,7]
2535 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,3,1,4,5,6,7]
2536 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2537 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[3,1,2,3,4,5,6,7]
2538 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7]
2539 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
2540 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2541 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2542 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2543 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rsi)
2544 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2545 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rsi)
2546 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2547 ; AVX2-FP-NEXT: vmovaps %ymm3, 32(%rdx)
2548 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2549 ; AVX2-FP-NEXT: vmovaps %ymm3, (%rdx)
2550 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%rcx)
2551 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2552 ; AVX2-FP-NEXT: vmovaps %ymm0, (%rcx)
2553 ; AVX2-FP-NEXT: vmovdqa %ymm2, 32(%r8)
2554 ; AVX2-FP-NEXT: vmovdqa %ymm1, (%r8)
2555 ; AVX2-FP-NEXT: addq $184, %rsp
2556 ; AVX2-FP-NEXT: vzeroupper
2557 ; AVX2-FP-NEXT: retq
2559 ; AVX2-FCP-LABEL: load_i16_stride4_vf32:
2560 ; AVX2-FCP: # %bb.0:
2561 ; AVX2-FCP-NEXT: subq $104, %rsp
2562 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm3
2563 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2564 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm6
2565 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2566 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm9
2567 ; AVX2-FCP-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2568 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm7
2569 ; AVX2-FCP-NEXT: vpxor %xmm0, %xmm0, %xmm0
2570 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2571 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm2
2572 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
2573 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2574 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm4
2575 ; AVX2-FCP-NEXT: vpackusdw %xmm4, %xmm2, %xmm2
2576 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
2577 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [0,2,2,3,0,2,4,6]
2578 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm2, %ymm5
2579 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
2580 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm8
2581 ; AVX2-FCP-NEXT: vpermd %ymm9, %ymm2, %ymm10
2582 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm9
2583 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
2584 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm8[4,5,6,7]
2585 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2586 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2587 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm8
2588 ; AVX2-FCP-NEXT: vpackusdw %xmm8, %xmm1, %xmm1
2589 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
2590 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm8
2591 ; AVX2-FCP-NEXT: vpackusdw %xmm8, %xmm0, %xmm0
2592 ; AVX2-FCP-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
2593 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm2, %ymm9
2594 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm1
2595 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm2, %ymm6
2596 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm8
2597 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7]
2598 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2599 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2600 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm14
2601 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm3
2602 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
2603 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm3, %xmm0
2604 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm14, %xmm8
2605 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
2606 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm13
2607 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm2
2608 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
2609 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm2, %xmm4
2610 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm13, %xmm11
2611 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm11[0],xmm4[0],xmm11[1],xmm4[1]
2612 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3]
2613 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
2614 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm5, %ymm5
2615 ; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm10
2616 ; AVX2-FCP-NEXT: vmovdqa %ymm0, %ymm8
2617 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4,5],ymm5[6,7]
2618 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3],ymm5[4,5,6,7]
2619 ; AVX2-FCP-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill
2620 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm11
2621 ; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm15
2622 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm15, %xmm4
2623 ; AVX2-FCP-NEXT: vpshufb %xmm12, %xmm11, %xmm10
2624 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1]
2625 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm10
2626 ; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm12
2627 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm12, %xmm0
2628 ; AVX2-FCP-NEXT: vpshufb %xmm1, %xmm10, %xmm1
2629 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
2630 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3]
2631 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm9, %ymm1
2632 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4
2633 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5],ymm1[6,7]
2634 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
2635 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2636 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,3,2,3,1,3,5,7]
2637 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm6, %ymm4
2638 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm7 # 32-byte Folded Reload
2639 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
2640 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm4, %ymm0
2641 ; AVX2-FCP-NEXT: vpshufb %ymm5, %ymm7, %ymm1
2642 ; AVX2-FCP-NEXT: vmovdqa %ymm5, %ymm8
2643 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
2644 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
2645 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm14 = xmm14[3,1,2,3]
2646 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm1[0,1,2,0,4,5,6,7]
2647 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[0,1,2,0,4,5,6,7]
2648 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
2649 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
2650 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
2651 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm2[2,0,2,3,4,5,6,7]
2652 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm13[2,0,2,3,4,5,6,7]
2653 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
2654 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm5[0,1],xmm3[2,3]
2655 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
2656 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
2657 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload
2658 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm5 # 32-byte Folded Reload
2659 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm6
2660 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9
2661 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
2662 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm9 = xmm15[3,1,2,3]
2663 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
2664 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[0,1,2,0,4,5,6,7]
2665 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[0,1,2,0,4,5,6,7]
2666 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1]
2667 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[3,1,2,3]
2668 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
2669 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm12[2,0,2,3,4,5,6,7]
2670 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm8 = xmm10[2,0,2,3,4,5,6,7]
2671 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1]
2672 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm8[0,1],xmm0[2,3]
2673 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm6[4,5,6,7]
2674 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
2675 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm4, %ymm4
2676 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm7, %ymm6
2677 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3,4,5],ymm4[6,7]
2678 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
2679 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm14[0,1,3,1,4,5,6,7]
2680 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
2681 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
2682 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm13[3,1,2,3,4,5,6,7]
2683 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
2684 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
2685 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm4[4,5,6,7]
2686 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm3, %ymm2
2687 ; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm3
2688 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
2689 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm9[0,1,3,1,4,5,6,7]
2690 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7]
2691 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2692 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7]
2693 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm10[3,1,2,3,4,5,6,7]
2694 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
2695 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
2696 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2697 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2698 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi)
2699 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2700 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi)
2701 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
2702 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx)
2703 ; AVX2-FCP-NEXT: vmovups (%rsp), %ymm3 # 32-byte Reload
2704 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx)
2705 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 32(%rcx)
2706 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
2707 ; AVX2-FCP-NEXT: vmovaps %ymm0, (%rcx)
2708 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 32(%r8)
2709 ; AVX2-FCP-NEXT: vmovdqa %ymm1, (%r8)
2710 ; AVX2-FCP-NEXT: addq $104, %rsp
2711 ; AVX2-FCP-NEXT: vzeroupper
2712 ; AVX2-FCP-NEXT: retq
2714 ; AVX512-LABEL: load_i16_stride4_vf32:
2716 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
2717 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm1
2718 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm2
2719 ; AVX512-NEXT: vpmovqw %ymm2, %xmm2
2720 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2721 ; AVX512-NEXT: vmovdqa 240(%rdi), %xmm6
2722 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
2723 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7]
2724 ; AVX512-NEXT: vmovdqa 224(%rdi), %xmm7
2725 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
2726 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
2727 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2728 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2729 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2730 ; AVX512-NEXT: vpmovqw %zmm1, %xmm3
2731 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2732 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm3
2733 ; AVX512-NEXT: vpmovqw %ymm3, %xmm3
2734 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10
2735 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm3
2736 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
2737 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7]
2738 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm4
2739 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
2740 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7]
2741 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
2742 ; AVX512-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2743 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
2744 ; AVX512-NEXT: vpmovqw %zmm0, %xmm11
2745 ; AVX512-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2746 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3]
2747 ; AVX512-NEXT: vmovdqa 64(%rdi), %xmm10
2748 ; AVX512-NEXT: vmovdqa 80(%rdi), %xmm11
2749 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm13
2750 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm14
2751 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
2752 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
2753 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
2754 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2755 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
2756 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
2757 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3]
2758 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
2759 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
2760 ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2761 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
2762 ; AVX512-NEXT: vpsrlq $16, %zmm1, %zmm9
2763 ; AVX512-NEXT: vpmovqw %zmm9, %xmm9
2764 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
2765 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
2766 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7]
2767 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2768 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
2769 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3]
2770 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
2771 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3]
2772 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
2773 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
2774 ; AVX512-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2775 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
2776 ; AVX512-NEXT: vpsrlq $16, %zmm0, %zmm9
2777 ; AVX512-NEXT: vpmovqw %zmm9, %xmm9
2778 ; AVX512-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2779 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[0,1,2,3]
2780 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
2781 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7]
2782 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
2783 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7]
2784 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2785 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm12
2786 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3]
2787 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7]
2788 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3]
2789 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7]
2790 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
2791 ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
2792 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
2793 ; AVX512-NEXT: vpsrlq $32, %zmm1, %zmm13
2794 ; AVX512-NEXT: vpmovqw %zmm13, %xmm13
2795 ; AVX512-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
2796 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
2797 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7]
2798 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
2799 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,1,2,0,4,5,6,7]
2800 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
2801 ; AVX512-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
2802 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
2803 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7]
2804 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
2805 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7]
2806 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
2807 ; AVX512-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
2808 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
2809 ; AVX512-NEXT: vpsrlq $32, %zmm0, %zmm14
2810 ; AVX512-NEXT: vpmovqw %zmm14, %xmm14
2811 ; AVX512-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
2812 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[0,1,2,3]
2813 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
2814 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
2815 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
2816 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
2817 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
2818 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7]
2819 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
2820 ; AVX512-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
2821 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
2822 ; AVX512-NEXT: vpsrlq $48, %zmm1, %zmm1
2823 ; AVX512-NEXT: vpmovqw %zmm1, %xmm1
2824 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
2825 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
2826 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
2827 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2828 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2829 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7]
2830 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7]
2831 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
2832 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
2833 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
2834 ; AVX512-NEXT: vpsrlq $48, %zmm0, %zmm0
2835 ; AVX512-NEXT: vpmovqw %zmm0, %xmm0
2836 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2837 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
2838 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rsi)
2839 ; AVX512-NEXT: vmovdqa64 %zmm5, (%rdx)
2840 ; AVX512-NEXT: vmovdqa64 %zmm12, (%rcx)
2841 ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
2842 ; AVX512-NEXT: vzeroupper
2845 ; AVX512-FCP-LABEL: load_i16_stride4_vf32:
2846 ; AVX512-FCP: # %bb.0:
2847 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
2848 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
2849 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
2850 ; AVX512-FCP-NEXT: # ymm2 = mem[0,1,0,1]
2851 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6]
2852 ; AVX512-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
2853 ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
2854 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm7
2855 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
2856 ; AVX512-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
2857 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9
2858 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10
2859 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14]
2860 ; AVX512-FCP-NEXT: vpermt2d %ymm7, %ymm11, %ymm10
2861 ; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm7
2862 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
2863 ; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm10
2864 ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm4, %ymm12
2865 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13
2866 ; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
2867 ; AVX512-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm15
2868 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm4
2869 ; AVX512-FCP-NEXT: vpermt2d %ymm13, %ymm11, %ymm4
2870 ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm13
2871 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
2872 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3]
2873 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
2874 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
2875 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9
2876 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
2877 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm1, %zmm9
2878 ; AVX512-FCP-NEXT: vpmovqw %zmm9, %xmm9
2879 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
2880 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm9
2881 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm12
2882 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
2883 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm0, %zmm12
2884 ; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12
2885 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
2886 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
2887 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7]
2888 ; AVX512-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5
2889 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12
2890 ; AVX512-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
2891 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm13
2892 ; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm11, %ymm13
2893 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm1, %zmm12
2894 ; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12
2895 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
2896 ; AVX512-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm10
2897 ; AVX512-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
2898 ; AVX512-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9
2899 ; AVX512-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
2900 ; AVX512-FCP-NEXT: vpermt2d %ymm2, %ymm11, %ymm3
2901 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
2902 ; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm2
2903 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
2904 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3]
2905 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm3
2906 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm5
2907 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
2908 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm1, %zmm1
2909 ; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm1
2910 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
2911 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3
2912 ; AVX512-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
2913 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
2914 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm0, %zmm0
2915 ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0
2916 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
2917 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
2918 ; AVX512-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
2919 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%rdx)
2920 ; AVX512-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
2921 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
2922 ; AVX512-FCP-NEXT: vzeroupper
2923 ; AVX512-FCP-NEXT: retq
2925 ; AVX512DQ-LABEL: load_i16_stride4_vf32:
2926 ; AVX512DQ: # %bb.0:
2927 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0
2928 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm1
2929 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm2
2930 ; AVX512DQ-NEXT: vpmovqw %ymm2, %xmm2
2931 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
2932 ; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm6
2933 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
2934 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,0,2,4,5,6,7]
2935 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm7
2936 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm7[0,2,2,3]
2937 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[0,1,0,2,4,5,6,7]
2938 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2939 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
2940 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
2941 ; AVX512DQ-NEXT: vpmovqw %zmm1, %xmm3
2942 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
2943 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm3
2944 ; AVX512DQ-NEXT: vpmovqw %ymm3, %xmm3
2945 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm10
2946 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm3
2947 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[0,2,2,3]
2948 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm11 = xmm8[0,1,0,2,4,5,6,7]
2949 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm4
2950 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm4[0,2,2,3]
2951 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm12[0,1,0,2,4,5,6,7]
2952 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
2953 ; AVX512DQ-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11
2954 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm10[0,1,2,3,4,5],ymm11[6,7]
2955 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm11
2956 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7]
2957 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm10[0,1,2,3],zmm2[0,1,2,3]
2958 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm10
2959 ; AVX512DQ-NEXT: vmovdqa 80(%rdi), %xmm11
2960 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm13
2961 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm14
2962 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7]
2963 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
2964 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm9[0],xmm5[0],xmm9[1],xmm5[1]
2965 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5
2966 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
2967 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
2968 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm13[0,2,2,3]
2969 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,3,2,3,4,5,6,7]
2970 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1]
2971 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2972 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3,4,5],ymm5[6,7]
2973 ; AVX512DQ-NEXT: vpsrlq $16, %zmm1, %zmm9
2974 ; AVX512DQ-NEXT: vpmovqw %zmm9, %xmm9
2975 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1,2,3],ymm5[4,5,6,7]
2976 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7]
2977 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm12[0,1,1,3,4,5,6,7]
2978 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2979 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
2980 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm11[0,2,2,3]
2981 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
2982 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,2,2,3]
2983 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
2984 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm12[0],xmm9[0],xmm12[1],xmm9[1]
2985 ; AVX512DQ-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
2986 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7]
2987 ; AVX512DQ-NEXT: vpsrlq $16, %zmm0, %zmm9
2988 ; AVX512DQ-NEXT: vpmovqw %zmm9, %xmm9
2989 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7]
2990 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[0,1,2,3]
2991 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
2992 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm6[0,1,2,0,4,5,6,7]
2993 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
2994 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm7[0,1,2,0,4,5,6,7]
2995 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
2996 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm12
2997 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm14[3,1,2,3]
2998 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm8[2,0,2,3,4,5,6,7]
2999 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[3,1,2,3]
3000 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm9[2,0,2,3,4,5,6,7]
3001 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1]
3002 ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3003 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3,4,5],ymm12[6,7]
3004 ; AVX512DQ-NEXT: vpsrlq $32, %zmm1, %zmm13
3005 ; AVX512DQ-NEXT: vpmovqw %zmm13, %xmm13
3006 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm12 = ymm13[0,1,2,3],ymm12[4,5,6,7]
3007 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
3008 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[0,1,2,0,4,5,6,7]
3009 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[3,1,2,3]
3010 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm4[0,1,2,0,4,5,6,7]
3011 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1]
3012 ; AVX512DQ-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
3013 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[3,1,2,3]
3014 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm11[2,0,2,3,4,5,6,7]
3015 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[3,1,2,3]
3016 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm15 = xmm10[2,0,2,3,4,5,6,7]
3017 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
3018 ; AVX512DQ-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14
3019 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5],ymm13[6,7]
3020 ; AVX512DQ-NEXT: vpsrlq $32, %zmm0, %zmm14
3021 ; AVX512DQ-NEXT: vpmovqw %zmm14, %xmm14
3022 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3],ymm13[4,5,6,7]
3023 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm13[0,1,2,3],zmm12[0,1,2,3]
3024 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,3,1,4,5,6,7]
3025 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7]
3026 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
3027 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
3028 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm8[3,1,2,3,4,5,6,7]
3029 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm9[3,1,2,3,4,5,6,7]
3030 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
3031 ; AVX512DQ-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
3032 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
3033 ; AVX512DQ-NEXT: vpsrlq $48, %zmm1, %zmm1
3034 ; AVX512DQ-NEXT: vpmovqw %zmm1, %xmm1
3035 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm6[4,5,6,7]
3036 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
3037 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
3038 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
3039 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
3040 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[3,1,2,3,4,5,6,7]
3041 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7]
3042 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1]
3043 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
3044 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
3045 ; AVX512DQ-NEXT: vpsrlq $48, %zmm0, %zmm0
3046 ; AVX512DQ-NEXT: vpmovqw %zmm0, %xmm0
3047 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
3048 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
3049 ; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rsi)
3050 ; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rdx)
3051 ; AVX512DQ-NEXT: vmovdqa64 %zmm12, (%rcx)
3052 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
3053 ; AVX512DQ-NEXT: vzeroupper
3054 ; AVX512DQ-NEXT: retq
3056 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf32:
3057 ; AVX512DQ-FCP: # %bb.0:
3058 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3059 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm1
3060 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
3061 ; AVX512DQ-FCP-NEXT: # ymm2 = mem[0,1,0,1]
3062 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,2,2,3,0,2,4,6]
3063 ; AVX512DQ-FCP-NEXT: vmovdqa 224(%rdi), %ymm5
3064 ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm4, %ymm6
3065 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm6, %ymm7
3066 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
3067 ; AVX512DQ-FCP-NEXT: vmovdqa 192(%rdi), %ymm8
3068 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm4, %ymm9
3069 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm10
3070 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm11 = [0,2,2,3,4,6,12,14]
3071 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm7, %ymm11, %ymm10
3072 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm7
3073 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm10[4,5,6,7]
3074 ; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm10
3075 ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm4, %ymm12
3076 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm12, %ymm13
3077 ; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm14
3078 ; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm4, %ymm15
3079 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm15, %ymm4
3080 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm13, %ymm11, %ymm4
3081 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm13
3082 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm13[0,1,2,3],ymm4[4,5,6,7]
3083 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3]
3084 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
3085 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm6
3086 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm9
3087 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3,4,5],ymm6[6,7]
3088 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm1, %zmm9
3089 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm9, %xmm9
3090 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2,3],ymm6[4,5,6,7]
3091 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm9
3092 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm15, %ymm12
3093 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3,4,5],ymm9[6,7]
3094 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm0, %zmm12
3095 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12
3096 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0,1,2,3],ymm9[4,5,6,7]
3097 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm9[0,1,2,3],zmm6[0,1,2,3]
3098 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,3,2,3,1,3,5,7]
3099 ; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm9, %ymm5
3100 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm5, %ymm12
3101 ; AVX512DQ-FCP-NEXT: vpermd %ymm8, %ymm9, %ymm8
3102 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm8, %ymm13
3103 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm11, %ymm13
3104 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm1, %zmm12
3105 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12
3106 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
3107 ; AVX512DQ-FCP-NEXT: vpermd %ymm10, %ymm9, %ymm10
3108 ; AVX512DQ-FCP-NEXT: vpshufb %ymm2, %ymm10, %ymm2
3109 ; AVX512DQ-FCP-NEXT: vpermd %ymm14, %ymm9, %ymm9
3110 ; AVX512DQ-FCP-NEXT: vpshufb %ymm3, %ymm9, %ymm3
3111 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm2, %ymm11, %ymm3
3112 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm0, %zmm2
3113 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm2
3114 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
3115 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[0,1,2,3]
3116 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm3
3117 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm5
3118 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
3119 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm1, %zmm1
3120 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm1
3121 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
3122 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm10, %ymm3
3123 ; AVX512DQ-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm4
3124 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
3125 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm0, %zmm0
3126 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0
3127 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
3128 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[0,1,2,3]
3129 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, (%rsi)
3130 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%rdx)
3131 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, (%rcx)
3132 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
3133 ; AVX512DQ-FCP-NEXT: vzeroupper
3134 ; AVX512DQ-FCP-NEXT: retq
3136 ; AVX512BW-LABEL: load_i16_stride4_vf32:
3137 ; AVX512BW: # %bb.0:
3138 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
3139 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3140 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3141 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3142 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
3143 ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
3144 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5
3145 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm4, %zmm5
3146 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3147 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
3148 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
3149 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
3150 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm6
3151 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm5, %zmm6
3152 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3153 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
3154 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
3155 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3156 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7
3157 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm7
3158 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3159 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
3160 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
3161 ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
3162 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm7, %zmm2
3163 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm0
3164 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3165 ; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rsi)
3166 ; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rdx)
3167 ; AVX512BW-NEXT: vmovdqa64 %zmm6, (%rcx)
3168 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8)
3169 ; AVX512BW-NEXT: vzeroupper
3170 ; AVX512BW-NEXT: retq
3172 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf32:
3173 ; AVX512BW-FCP: # %bb.0:
3174 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3175 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3176 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3177 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3178 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
3179 ; AVX512BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
3180 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
3181 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm4, %zmm5
3182 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3183 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
3184 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
3185 ; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
3186 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
3187 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm6
3188 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3189 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
3190 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
3191 ; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3192 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
3193 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm7
3194 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3195 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
3196 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
3197 ; AVX512BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
3198 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm7, %zmm2
3199 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm7, %zmm0
3200 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3201 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
3202 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
3203 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx)
3204 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
3205 ; AVX512BW-FCP-NEXT: vzeroupper
3206 ; AVX512BW-FCP-NEXT: retq
3208 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf32:
3209 ; AVX512DQ-BW: # %bb.0:
3210 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
3211 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
3212 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
3213 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
3214 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
3215 ; AVX512DQ-BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
3216 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm5
3217 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm4, %zmm5
3218 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3219 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
3220 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
3221 ; AVX512DQ-BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
3222 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm6
3223 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm5, %zmm6
3224 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3225 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
3226 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
3227 ; AVX512DQ-BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3228 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm7
3229 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm6, %zmm7
3230 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3231 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
3232 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
3233 ; AVX512DQ-BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
3234 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm7, %zmm2
3235 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm0
3236 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3237 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rsi)
3238 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rdx)
3239 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, (%rcx)
3240 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8)
3241 ; AVX512DQ-BW-NEXT: vzeroupper
3242 ; AVX512DQ-BW-NEXT: retq
3244 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf32:
3245 ; AVX512DQ-BW-FCP: # %bb.0:
3246 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
3247 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
3248 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
3249 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
3250 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
3251 ; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3]
3252 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm5
3253 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm4, %zmm5
3254 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
3255 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
3256 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
3257 ; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
3258 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm6
3259 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm6
3260 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm5
3261 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
3262 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
3263 ; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
3264 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm7
3265 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm6, %zmm7
3266 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
3267 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm7[4,5,6,7]
3268 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
3269 ; AVX512DQ-BW-FCP-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
3270 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm7, %zmm2
3271 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm7, %zmm0
3272 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
3273 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rsi)
3274 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rdx)
3275 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, (%rcx)
3276 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
3277 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
3278 ; AVX512DQ-BW-FCP-NEXT: retq
3279 %wide.vec = load <128 x i16>, ptr %in.vec, align 64
3280 %strided.vec0 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124>
3281 %strided.vec1 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <32 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125>
3282 %strided.vec2 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <32 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126>
3283 %strided.vec3 = shufflevector <128 x i16> %wide.vec, <128 x i16> poison, <32 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127>
3284 store <32 x i16> %strided.vec0, ptr %out.vec0, align 64
3285 store <32 x i16> %strided.vec1, ptr %out.vec1, align 64
3286 store <32 x i16> %strided.vec2, ptr %out.vec2, align 64
3287 store <32 x i16> %strided.vec3, ptr %out.vec3, align 64
3291 define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind {
3292 ; SSE-LABEL: load_i16_stride4_vf64:
3294 ; SSE-NEXT: subq $824, %rsp # imm = 0x338
3295 ; SSE-NEXT: movdqa 352(%rdi), %xmm3
3296 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3297 ; SSE-NEXT: movdqa 320(%rdi), %xmm4
3298 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3299 ; SSE-NEXT: movdqa 336(%rdi), %xmm5
3300 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3301 ; SSE-NEXT: movdqa 96(%rdi), %xmm2
3302 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3303 ; SSE-NEXT: movdqa 112(%rdi), %xmm6
3304 ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3305 ; SSE-NEXT: movdqa 64(%rdi), %xmm1
3306 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3307 ; SSE-NEXT: movdqa 80(%rdi), %xmm0
3308 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3309 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3310 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3311 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3312 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3313 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3314 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3315 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3316 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3]
3317 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3318 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3319 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3320 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3321 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3322 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3323 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3324 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3325 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[0,2,2,3]
3326 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3327 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3328 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3]
3329 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3330 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3331 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3332 ; SSE-NEXT: movdqa 368(%rdi), %xmm0
3333 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3334 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3335 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3336 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3337 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
3338 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3339 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3340 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3341 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3342 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3343 ; SSE-NEXT: movdqa (%rdi), %xmm1
3344 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3345 ; SSE-NEXT: movdqa 16(%rdi), %xmm0
3346 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3347 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3348 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3349 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3350 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3351 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3352 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3353 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3354 ; SSE-NEXT: movdqa 32(%rdi), %xmm2
3355 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3356 ; SSE-NEXT: movdqa 48(%rdi), %xmm0
3357 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3358 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3359 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3360 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3361 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3362 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3363 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3364 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3365 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3366 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3367 ; SSE-NEXT: movdqa 256(%rdi), %xmm1
3368 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3369 ; SSE-NEXT: movdqa 272(%rdi), %xmm0
3370 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3371 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3372 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3373 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3374 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3375 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3376 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3377 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3378 ; SSE-NEXT: movdqa 288(%rdi), %xmm2
3379 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3380 ; SSE-NEXT: movdqa 304(%rdi), %xmm0
3381 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3382 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3383 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3384 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3385 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3386 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3387 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,2,4,5,6,7]
3388 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3389 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3390 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3391 ; SSE-NEXT: movdqa 192(%rdi), %xmm1
3392 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3393 ; SSE-NEXT: movdqa 208(%rdi), %xmm0
3394 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
3395 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3396 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3397 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
3398 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
3399 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3400 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,3,4,5,6,7]
3401 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3402 ; SSE-NEXT: movdqa 224(%rdi), %xmm2
3403 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3404 ; SSE-NEXT: movdqa 240(%rdi), %xmm0
3405 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3406 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
3407 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3408 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,2,4,5,6,7]
3409 ; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm2[0,2,2,3]
3410 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,1,0,2,4,5,6,7]
3411 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3412 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3413 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3414 ; SSE-NEXT: movdqa 448(%rdi), %xmm1
3415 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3416 ; SSE-NEXT: movdqa 464(%rdi), %xmm0
3417 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3418 ; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm0[0,2,2,3]
3419 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,2,2,3,4,5,6,7]
3420 ; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm1[0,2,2,3]
3421 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[0,2,2,3,4,5,6,7]
3422 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3423 ; SSE-NEXT: movdqa 480(%rdi), %xmm2
3424 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3425 ; SSE-NEXT: movdqa 496(%rdi), %xmm0
3426 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3427 ; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm0[0,2,2,3]
3428 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,0,2,4,5,6,7]
3429 ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm2[0,2,2,3]
3430 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm10[0,1,0,2,4,5,6,7]
3431 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3432 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3433 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3434 ; SSE-NEXT: movdqa 128(%rdi), %xmm1
3435 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3436 ; SSE-NEXT: movdqa 144(%rdi), %xmm0
3437 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3438 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
3439 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[0,2,2,3,4,5,6,7]
3440 ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3]
3441 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,2,2,3,4,5,6,7]
3442 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3443 ; SSE-NEXT: movdqa 160(%rdi), %xmm2
3444 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3445 ; SSE-NEXT: movdqa 176(%rdi), %xmm0
3446 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3447 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm0[0,2,2,3]
3448 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,0,2,4,5,6,7]
3449 ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,2,3]
3450 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[0,1,0,2,4,5,6,7]
3451 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3452 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3453 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3454 ; SSE-NEXT: movdqa 384(%rdi), %xmm0
3455 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3456 ; SSE-NEXT: movdqa 400(%rdi), %xmm1
3457 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3458 ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3]
3459 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,2,2,3,4,5,6,7]
3460 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm0[0,2,2,3]
3461 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,2,2,3,4,5,6,7]
3462 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3463 ; SSE-NEXT: movdqa 416(%rdi), %xmm2
3464 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3465 ; SSE-NEXT: movdqa 432(%rdi), %xmm1
3466 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3467 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3]
3468 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[0,1,0,2,4,5,6,7]
3469 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
3470 ; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm2[0,1,0,2,4,5,6,7]
3471 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
3472 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm0[0],xmm15[1]
3473 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3474 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3475 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
3476 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3477 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
3478 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3479 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3480 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
3481 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3482 ; SSE-NEXT: # xmm15 = mem[0,1,1,3,4,5,6,7]
3483 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3484 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
3485 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3486 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3487 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
3488 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3489 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
3490 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3491 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3492 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
3493 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3494 ; SSE-NEXT: # xmm15 = mem[0,1,1,3,4,5,6,7]
3495 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3496 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
3497 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3498 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3499 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
3500 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3501 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
3502 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3503 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3504 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
3505 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3506 ; SSE-NEXT: # xmm15 = mem[0,1,1,3,4,5,6,7]
3507 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3508 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
3509 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3510 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3511 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
3512 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3513 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
3514 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3515 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3516 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
3517 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3518 ; SSE-NEXT: # xmm15 = mem[0,1,1,3,4,5,6,7]
3519 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
3520 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm1[0],xmm15[1]
3521 ; SSE-NEXT: movapd %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3522 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3523 ; SSE-NEXT: # xmm0 = mem[1,3,2,3,4,5,6,7]
3524 ; SSE-NEXT: pshuflw $237, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3525 ; SSE-NEXT: # xmm1 = mem[1,3,2,3,4,5,6,7]
3526 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3527 ; SSE-NEXT: pshuflw $212, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3528 ; SSE-NEXT: # xmm0 = mem[0,1,1,3,4,5,6,7]
3529 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[0,1,1,3,4,5,6,7]
3530 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
3531 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm1[0],xmm14[1]
3532 ; SSE-NEXT: movapd %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3533 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[1,3,2,3,4,5,6,7]
3534 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[1,3,2,3,4,5,6,7]
3535 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3536 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,1,1,3,4,5,6,7]
3537 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
3538 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
3539 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm1[0],xmm10[1]
3540 ; SSE-NEXT: movapd %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3541 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7]
3542 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[1,3,2,3,4,5,6,7]
3543 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3544 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[0,1,1,3,4,5,6,7]
3545 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
3546 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1]
3547 ; SSE-NEXT: movsd {{.*#+}} xmm6 = xmm1[0],xmm6[1]
3548 ; SSE-NEXT: movapd %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3549 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7]
3550 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[1,3,2,3,4,5,6,7]
3551 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3552 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[0,1,1,3,4,5,6,7]
3553 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
3554 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3555 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3556 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3557 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3558 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3559 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3560 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3561 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3562 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3563 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3564 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
3565 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3566 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3567 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3568 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3569 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3570 ; SSE-NEXT: # xmm2 = mem[3,1,2,3]
3571 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3572 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
3573 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
3574 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3575 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3576 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3577 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3578 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3579 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3580 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3581 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3582 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3583 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3584 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
3585 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3586 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3587 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3588 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3589 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3590 ; SSE-NEXT: # xmm2 = mem[3,1,2,3]
3591 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3592 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
3593 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
3594 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3595 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
3596 ; SSE-NEXT: movapd %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3597 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3598 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3599 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3600 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3601 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3602 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3603 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3604 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
3605 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3606 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
3607 ; SSE-NEXT: # xmm13 = mem[3,1,2,3]
3608 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3609 ; SSE-NEXT: # xmm8 = mem[3,1,2,3]
3610 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,1,2,0,4,5,6,7]
3611 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7]
3612 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3613 ; SSE-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
3614 ; SSE-NEXT: movapd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3615 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3616 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3617 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3618 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3619 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3620 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3621 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3622 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
3623 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3624 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3625 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3626 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3627 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3628 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3629 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3630 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
3631 ; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm1[0,1,2,0,4,5,6,7]
3632 ; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
3633 ; SSE-NEXT: movsd {{.*#+}} xmm14 = xmm2[0],xmm14[1]
3634 ; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload
3635 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3636 ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
3637 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3638 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3639 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3640 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3641 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
3642 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3643 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3644 ; SSE-NEXT: # xmm15 = mem[3,1,2,3]
3645 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
3646 ; SSE-NEXT: # xmm12 = mem[3,1,2,3]
3647 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[0,1,2,0,4,5,6,7]
3648 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm12[0,1,2,0,4,5,6,7]
3649 ; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
3650 ; SSE-NEXT: movsd {{.*#+}} xmm10 = xmm2[0],xmm10[1]
3651 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3652 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3653 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3654 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3655 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3656 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3657 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
3658 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,0,2,3,4,5,6,7]
3659 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
3660 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3661 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3662 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3663 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3664 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3665 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3666 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
3667 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm1[0,1,2,0,4,5,6,7]
3668 ; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1]
3669 ; SSE-NEXT: movsd {{.*#+}} xmm9 = xmm2[0],xmm9[1]
3670 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3671 ; SSE-NEXT: # xmm7 = mem[3,1,2,3]
3672 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3673 ; SSE-NEXT: # xmm6 = mem[3,1,2,3]
3674 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
3675 ; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm6[2,0,2,3,4,5,6,7]
3676 ; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1]
3677 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3678 ; SSE-NEXT: # xmm4 = mem[3,1,2,3]
3679 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3680 ; SSE-NEXT: # xmm3 = mem[3,1,2,3]
3681 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[0,1,2,0,4,5,6,7]
3682 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7]
3683 ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
3684 ; SSE-NEXT: movsd {{.*#+}} xmm5 = xmm11[0],xmm5[1]
3685 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3686 ; SSE-NEXT: # xmm0 = mem[3,1,2,3]
3687 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3688 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
3689 ; SSE-NEXT: # xmm2 = mem[3,1,2,3]
3690 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3691 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[2,0,2,3,4,5,6,7]
3692 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[2,0,2,3,4,5,6,7]
3693 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
3694 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3695 ; SSE-NEXT: # xmm1 = mem[3,1,2,3]
3696 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3697 ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3698 ; SSE-NEXT: # xmm11 = mem[3,1,2,3]
3699 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
3700 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,1,2,0,4,5,6,7]
3701 ; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
3702 ; SSE-NEXT: movsd {{.*#+}} xmm2 = xmm0[0],xmm2[1]
3703 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
3704 ; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7]
3705 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3706 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3707 ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
3708 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,3,1,4,5,6,7]
3709 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,1,3,1,4,5,6,7]
3710 ; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
3711 ; SSE-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
3712 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3713 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3714 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
3715 ; SSE-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7]
3716 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
3717 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3718 ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
3719 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
3720 ; SSE-NEXT: # xmm8 = mem[0,1,3,1,4,5,6,7]
3721 ; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
3722 ; SSE-NEXT: movsd {{.*#+}} xmm8 = xmm13[0],xmm8[1]
3723 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7]
3724 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
3725 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
3726 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,1,3,1,4,5,6,7]
3727 ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm3[0,1,3,1,4,5,6,7]
3728 ; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm1[0],xmm13[1],xmm1[1]
3729 ; SSE-NEXT: movsd {{.*#+}} xmm13 = xmm6[0],xmm13[1]
3730 ; SSE-NEXT: pshuflw $231, (%rsp), %xmm1 # 16-byte Folded Reload
3731 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3732 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3733 ; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
3734 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3735 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[0,1,3,1,4,5,6,7]
3736 ; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,3,1,4,5,6,7]
3737 ; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1]
3738 ; SSE-NEXT: movsd {{.*#+}} xmm12 = xmm3[0],xmm12[1]
3739 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3740 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3741 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3742 ; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
3743 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3744 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3745 ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
3746 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
3747 ; SSE-NEXT: # xmm15 = mem[0,1,3,1,4,5,6,7]
3748 ; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm1[0],xmm15[1],xmm1[1]
3749 ; SSE-NEXT: movsd {{.*#+}} xmm15 = xmm3[0],xmm15[1]
3750 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3751 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3752 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
3753 ; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7]
3754 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3755 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3756 ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
3757 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
3758 ; SSE-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
3759 ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
3760 ; SSE-NEXT: movsd {{.*#+}} xmm3 = xmm4[0],xmm3[1]
3761 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3762 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3763 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3764 ; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7]
3765 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
3766 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3767 ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
3768 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm11[0,1,3,1,4,5,6,7]
3769 ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
3770 ; SSE-NEXT: movsd {{.*#+}} xmm4 = xmm6[0],xmm4[1]
3771 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3772 ; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
3773 ; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
3774 ; SSE-NEXT: # xmm6 = mem[3,1,2,3,4,5,6,7]
3775 ; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
3776 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
3777 ; SSE-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
3778 ; SSE-NEXT: pshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
3779 ; SSE-NEXT: # xmm7 = mem[0,1,3,1,4,5,6,7]
3780 ; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
3781 ; SSE-NEXT: movsd {{.*#+}} xmm7 = xmm6[0],xmm7[1]
3782 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3783 ; SSE-NEXT: movaps %xmm1, 96(%rsi)
3784 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3785 ; SSE-NEXT: movaps %xmm1, 32(%rsi)
3786 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3787 ; SSE-NEXT: movaps %xmm1, 112(%rsi)
3788 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3789 ; SSE-NEXT: movaps %xmm1, 48(%rsi)
3790 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3791 ; SSE-NEXT: movaps %xmm1, 64(%rsi)
3792 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3793 ; SSE-NEXT: movaps %xmm1, (%rsi)
3794 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3795 ; SSE-NEXT: movaps %xmm1, 80(%rsi)
3796 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3797 ; SSE-NEXT: movaps %xmm1, 16(%rsi)
3798 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3799 ; SSE-NEXT: movaps %xmm1, 96(%rdx)
3800 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3801 ; SSE-NEXT: movaps %xmm1, 32(%rdx)
3802 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3803 ; SSE-NEXT: movaps %xmm1, 112(%rdx)
3804 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3805 ; SSE-NEXT: movaps %xmm1, 48(%rdx)
3806 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3807 ; SSE-NEXT: movaps %xmm1, 64(%rdx)
3808 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3809 ; SSE-NEXT: movaps %xmm1, (%rdx)
3810 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3811 ; SSE-NEXT: movaps %xmm1, 80(%rdx)
3812 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3813 ; SSE-NEXT: movaps %xmm1, 16(%rdx)
3814 ; SSE-NEXT: movapd %xmm2, 96(%rcx)
3815 ; SSE-NEXT: movapd %xmm5, 32(%rcx)
3816 ; SSE-NEXT: movapd %xmm9, 112(%rcx)
3817 ; SSE-NEXT: movapd %xmm10, 48(%rcx)
3818 ; SSE-NEXT: movapd %xmm14, 64(%rcx)
3819 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3820 ; SSE-NEXT: movaps %xmm1, (%rcx)
3821 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3822 ; SSE-NEXT: movaps %xmm1, 80(%rcx)
3823 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3824 ; SSE-NEXT: movaps %xmm1, 16(%rcx)
3825 ; SSE-NEXT: movapd %xmm7, 112(%r8)
3826 ; SSE-NEXT: movapd %xmm4, 96(%r8)
3827 ; SSE-NEXT: movapd %xmm3, 80(%r8)
3828 ; SSE-NEXT: movapd %xmm15, 64(%r8)
3829 ; SSE-NEXT: movapd %xmm12, 48(%r8)
3830 ; SSE-NEXT: movapd %xmm13, 32(%r8)
3831 ; SSE-NEXT: movapd %xmm8, 16(%r8)
3832 ; SSE-NEXT: movapd %xmm0, (%r8)
3833 ; SSE-NEXT: addq $824, %rsp # imm = 0x338
3836 ; AVX-LABEL: load_i16_stride4_vf64:
3838 ; AVX-NEXT: subq $776, %rsp # imm = 0x308
3839 ; AVX-NEXT: vpxor %xmm10, %xmm10, %xmm10
3840 ; AVX-NEXT: vmovdqa 368(%rdi), %xmm8
3841 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm8[0],xmm10[1,2,3],xmm8[4],xmm10[5,6,7]
3842 ; AVX-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3843 ; AVX-NEXT: vmovdqa 352(%rdi), %xmm1
3844 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3845 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3846 ; AVX-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
3847 ; AVX-NEXT: vmovdqa 336(%rdi), %xmm4
3848 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0],xmm10[1,2,3],xmm4[4],xmm10[5,6,7]
3849 ; AVX-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3850 ; AVX-NEXT: vmovdqa 320(%rdi), %xmm7
3851 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm7[0],xmm10[1,2,3],xmm7[4],xmm10[5,6,7]
3852 ; AVX-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3853 ; AVX-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
3854 ; AVX-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
3855 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3856 ; AVX-NEXT: vmovdqa 304(%rdi), %xmm13
3857 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0],xmm10[1,2,3],xmm13[4],xmm10[5,6,7]
3858 ; AVX-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3859 ; AVX-NEXT: vmovdqa 288(%rdi), %xmm1
3860 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3861 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3862 ; AVX-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
3863 ; AVX-NEXT: vmovdqa 272(%rdi), %xmm14
3864 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0],xmm10[1,2,3],xmm14[4],xmm10[5,6,7]
3865 ; AVX-NEXT: vmovdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3866 ; AVX-NEXT: vmovdqa 256(%rdi), %xmm2
3867 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3868 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7]
3869 ; AVX-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
3870 ; AVX-NEXT: vpackusdw %xmm0, %xmm1, %xmm0
3871 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3872 ; AVX-NEXT: vmovdqa 112(%rdi), %xmm0
3873 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3874 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3875 ; AVX-NEXT: vmovdqa 96(%rdi), %xmm0
3876 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3877 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3878 ; AVX-NEXT: vpackusdw %xmm2, %xmm3, %xmm2
3879 ; AVX-NEXT: vmovdqa 80(%rdi), %xmm0
3880 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3881 ; AVX-NEXT: vmovdqa %xmm0, %xmm3
3882 ; AVX-NEXT: vmovdqa 64(%rdi), %xmm0
3883 ; AVX-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
3884 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3885 ; AVX-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
3886 ; AVX-NEXT: vpackusdw %xmm2, %xmm5, %xmm2
3887 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3888 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm0
3889 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3890 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1
3891 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3892 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3893 ; AVX-NEXT: vpblendw {{.*#+}} xmm6 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3894 ; AVX-NEXT: vpackusdw %xmm5, %xmm6, %xmm5
3895 ; AVX-NEXT: vmovdqa (%rdi), %xmm0
3896 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3897 ; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
3898 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3899 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3900 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3901 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3902 ; AVX-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
3903 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3904 ; AVX-NEXT: vmovdqa 240(%rdi), %xmm0
3905 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3906 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3907 ; AVX-NEXT: vmovdqa 224(%rdi), %xmm15
3908 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm15[0],xmm10[1,2,3],xmm15[4],xmm10[5,6,7]
3909 ; AVX-NEXT: vmovdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3910 ; AVX-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
3911 ; AVX-NEXT: vmovdqa 208(%rdi), %xmm0
3912 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3913 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3914 ; AVX-NEXT: vmovdqa 192(%rdi), %xmm0
3915 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0],xmm10[1,2,3],xmm0[4],xmm10[5,6,7]
3916 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3917 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3918 ; AVX-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
3919 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3920 ; AVX-NEXT: vmovdqa 176(%rdi), %xmm2
3921 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3922 ; AVX-NEXT: vpblendw {{.*#+}} xmm5 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7]
3923 ; AVX-NEXT: vmovdqa 160(%rdi), %xmm2
3924 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3925 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7]
3926 ; AVX-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
3927 ; AVX-NEXT: vmovdqa 144(%rdi), %xmm9
3928 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3929 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm10[1,2,3],xmm9[4],xmm10[5,6,7]
3930 ; AVX-NEXT: vmovdqa 128(%rdi), %xmm11
3931 ; AVX-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3932 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm11[0],xmm10[1,2,3],xmm11[4],xmm10[5,6,7]
3933 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3934 ; AVX-NEXT: vpackusdw %xmm5, %xmm9, %xmm5
3935 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3936 ; AVX-NEXT: vmovdqa 496(%rdi), %xmm1
3937 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3938 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3939 ; AVX-NEXT: vmovdqa 480(%rdi), %xmm1
3940 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3941 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3942 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3943 ; AVX-NEXT: vmovdqa 464(%rdi), %xmm1
3944 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3945 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3946 ; AVX-NEXT: vmovdqa 448(%rdi), %xmm1
3947 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3948 ; AVX-NEXT: vpblendw {{.*#+}} xmm12 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3949 ; AVX-NEXT: vpackusdw %xmm11, %xmm12, %xmm11
3950 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3951 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3952 ; AVX-NEXT: vmovdqa 432(%rdi), %xmm2
3953 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3954 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7]
3955 ; AVX-NEXT: vmovdqa 416(%rdi), %xmm1
3956 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3957 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm1[0],xmm10[1,2,3],xmm1[4],xmm10[5,6,7]
3958 ; AVX-NEXT: vpackusdw %xmm9, %xmm11, %xmm9
3959 ; AVX-NEXT: vmovdqa 400(%rdi), %xmm5
3960 ; AVX-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3961 ; AVX-NEXT: vpblendw {{.*#+}} xmm11 = xmm5[0],xmm10[1,2,3],xmm5[4],xmm10[5,6,7]
3962 ; AVX-NEXT: vmovdqa 384(%rdi), %xmm2
3963 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3964 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm2[0],xmm10[1,2,3],xmm2[4],xmm10[5,6,7]
3965 ; AVX-NEXT: vpackusdw %xmm11, %xmm10, %xmm10
3966 ; AVX-NEXT: vpackusdw %xmm9, %xmm10, %xmm9
3967 ; AVX-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
3968 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,2,2,3]
3969 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
3970 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
3971 ; AVX-NEXT: # xmm10 = mem[0,2,2,3]
3972 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
3973 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
3974 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm4[0,2,2,3]
3975 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
3976 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm7[0,2,2,3]
3977 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
3978 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
3979 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7]
3980 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm13[0,2,2,3]
3981 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
3982 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
3983 ; AVX-NEXT: # xmm11 = mem[0,2,2,3]
3984 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
3985 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
3986 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm14[0,2,2,3]
3987 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
3988 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
3989 ; AVX-NEXT: # xmm12 = mem[0,2,2,3]
3990 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
3991 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
3992 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
3993 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
3994 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
3995 ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
3996 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
3997 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[0,2,2,3]
3998 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
3999 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4000 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm2[0,2,2,3]
4001 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4002 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
4003 ; AVX-NEXT: vmovdqa %xmm3, %xmm4
4004 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm3[0,2,2,3]
4005 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
4006 ; AVX-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload
4007 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm3[0,2,2,3]
4008 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4009 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4010 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7]
4011 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
4012 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm6[0,2,2,3]
4013 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4014 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
4015 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm5[0,2,2,3]
4016 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
4017 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4018 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
4019 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm8[0,2,2,3]
4020 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4021 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload
4022 ; AVX-NEXT: vpshufd {{.*#+}} xmm12 = xmm7[0,2,2,3]
4023 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
4024 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
4025 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
4026 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
4027 ; AVX-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4028 ; AVX-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4029 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4030 ; AVX-NEXT: # xmm9 = mem[0,2,2,3]
4031 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
4032 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm15[0,2,2,3]
4033 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4034 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
4035 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4036 ; AVX-NEXT: # xmm10 = mem[0,2,2,3]
4037 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
4038 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm0[0,2,2,3]
4039 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4040 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4041 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7]
4042 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4043 ; AVX-NEXT: # xmm10 = mem[0,2,2,3]
4044 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4045 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4046 ; AVX-NEXT: # xmm11 = mem[0,2,2,3]
4047 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
4048 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4049 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4050 ; AVX-NEXT: # xmm11 = mem[0,2,2,3]
4051 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4052 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4053 ; AVX-NEXT: # xmm12 = mem[0,2,2,3]
4054 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
4055 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
4056 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
4057 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
4058 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4059 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4060 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
4061 ; AVX-NEXT: vpshufd {{.*#+}} xmm9 = xmm13[0,2,2,3]
4062 ; AVX-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7]
4063 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
4064 ; AVX-NEXT: vpshufd {{.*#+}} xmm10 = xmm14[0,2,2,3]
4065 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4066 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
4067 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4068 ; AVX-NEXT: # xmm10 = mem[0,2,2,3]
4069 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
4070 ; AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
4071 ; AVX-NEXT: vpshufd {{.*#+}} xmm11 = xmm15[0,2,2,3]
4072 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4073 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4074 ; AVX-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3],xmm9[4,5,6,7]
4075 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4076 ; AVX-NEXT: # xmm10 = mem[0,2,2,3]
4077 ; AVX-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7]
4078 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4079 ; AVX-NEXT: # xmm11 = mem[0,2,2,3]
4080 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[0,1,1,3,4,5,6,7]
4081 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1]
4082 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4083 ; AVX-NEXT: # xmm11 = mem[0,2,2,3]
4084 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7]
4085 ; AVX-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4086 ; AVX-NEXT: # xmm12 = mem[0,2,2,3]
4087 ; AVX-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7]
4088 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1]
4089 ; AVX-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3],xmm10[4,5,6,7]
4090 ; AVX-NEXT: vinsertf128 $1, %xmm9, %ymm0, %ymm9
4091 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm10[0,1,2,3],ymm9[4,5,6,7]
4092 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4093 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
4094 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4095 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[3,1,2,3]
4096 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4097 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4098 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
4099 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4100 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm4[3,1,2,3]
4101 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4102 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,3]
4103 ; AVX-NEXT: vmovdqa %xmm2, (%rsp) # 16-byte Spill
4104 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
4105 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
4106 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4107 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4108 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[3,1,2,3]
4109 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4110 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
4111 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4112 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
4113 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4114 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4115 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[3,1,2,3]
4116 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4117 ; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[3,1,2,3]
4118 ; AVX-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4119 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4120 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
4121 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4122 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4123 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4124 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4125 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4126 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4127 ; AVX-NEXT: # xmm0 = mem[3,1,2,3]
4128 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4129 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4130 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
4131 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4132 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4133 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
4134 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4135 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4136 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
4137 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4138 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4139 ; AVX-NEXT: # xmm2 = mem[3,1,2,3]
4140 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4141 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
4142 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4143 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4144 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4145 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4146 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
4147 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4148 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4149 ; AVX-NEXT: # xmm2 = mem[3,1,2,3]
4150 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4151 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
4152 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4153 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4154 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4155 ; AVX-NEXT: # xmm12 = mem[3,1,2,3]
4156 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
4157 ; AVX-NEXT: # xmm11 = mem[3,1,2,3]
4158 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7]
4159 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7]
4160 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4161 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4162 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4163 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4164 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4165 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm13[3,1,2,3]
4166 ; AVX-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4167 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
4168 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4169 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4170 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm2[0,1,2,0,4,5,6,7]
4171 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4172 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4173 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
4174 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4175 ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
4176 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4177 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
4178 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4179 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4180 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4181 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4182 ; AVX-NEXT: # xmm1 = mem[3,1,2,3]
4183 ; AVX-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4184 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4185 ; AVX-NEXT: # xmm2 = mem[3,1,2,3]
4186 ; AVX-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4187 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
4188 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4189 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4190 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
4191 ; AVX-NEXT: # xmm13 = mem[3,1,2,3]
4192 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4193 ; AVX-NEXT: # xmm14 = mem[3,1,2,3]
4194 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm13[2,0,2,3,4,5,6,7]
4195 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[2,0,2,3,4,5,6,7]
4196 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4197 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4198 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4199 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4200 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4201 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4202 ; AVX-NEXT: # xmm10 = mem[3,1,2,3]
4203 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4204 ; AVX-NEXT: # xmm9 = mem[3,1,2,3]
4205 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,2,0,4,5,6,7]
4206 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7]
4207 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4208 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4209 ; AVX-NEXT: # xmm8 = mem[3,1,2,3]
4210 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4211 ; AVX-NEXT: # xmm7 = mem[3,1,2,3]
4212 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7]
4213 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7]
4214 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4215 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4216 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4217 ; AVX-NEXT: # xmm6 = mem[3,1,2,3]
4218 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4219 ; AVX-NEXT: # xmm5 = mem[3,1,2,3]
4220 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7]
4221 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7]
4222 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4223 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4224 ; AVX-NEXT: # xmm4 = mem[3,1,2,3]
4225 ; AVX-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4226 ; AVX-NEXT: # xmm2 = mem[3,1,2,3]
4227 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7]
4228 ; AVX-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7]
4229 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
4230 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
4231 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm1
4232 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
4233 ; AVX-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4234 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4235 ; AVX-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
4236 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4237 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4238 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4239 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4240 ; AVX-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
4241 ; AVX-NEXT: vpshuflw $231, (%rsp), %xmm3 # 16-byte Folded Reload
4242 ; AVX-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
4243 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4244 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4245 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4246 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4247 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4248 ; AVX-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
4249 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4250 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4251 ; AVX-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
4252 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4253 ; AVX-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7]
4254 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1]
4255 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
4256 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4257 ; AVX-NEXT: vblendps {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4258 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4259 ; AVX-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
4260 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4261 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4262 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4263 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4264 ; AVX-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
4265 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4266 ; AVX-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
4267 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4268 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4269 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4270 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4271 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4272 ; AVX-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
4273 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4274 ; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
4275 ; AVX-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
4276 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
4277 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
4278 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4279 ; AVX-NEXT: vblendps {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4280 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm10[0,1,3,1,4,5,6,7]
4281 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7]
4282 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4283 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7]
4284 ; AVX-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
4285 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
4286 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
4287 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7]
4288 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
4289 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
4290 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
4291 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
4292 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
4293 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4294 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
4295 ; AVX-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4296 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4297 ; AVX-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4298 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4299 ; AVX-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
4300 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4301 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4302 ; AVX-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
4303 ; AVX-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4304 ; AVX-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7]
4305 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
4306 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
4307 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4308 ; AVX-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
4309 ; AVX-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4310 ; AVX-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7]
4311 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
4312 ; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7]
4313 ; AVX-NEXT: vpshuflw {{.*#+}} xmm5 = xmm14[3,1,2,3,4,5,6,7]
4314 ; AVX-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
4315 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
4316 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
4317 ; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
4318 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4319 ; AVX-NEXT: vmovaps %xmm2, 96(%rsi)
4320 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4321 ; AVX-NEXT: vmovaps %xmm2, 112(%rsi)
4322 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4323 ; AVX-NEXT: vmovaps %xmm2, 32(%rsi)
4324 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4325 ; AVX-NEXT: vmovaps %xmm2, 48(%rsi)
4326 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4327 ; AVX-NEXT: vmovaps %xmm2, (%rsi)
4328 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4329 ; AVX-NEXT: vmovaps %xmm2, 16(%rsi)
4330 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4331 ; AVX-NEXT: vmovaps %xmm2, 64(%rsi)
4332 ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
4333 ; AVX-NEXT: vmovaps %xmm2, 80(%rsi)
4334 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4335 ; AVX-NEXT: vmovaps %ymm2, 96(%rdx)
4336 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4337 ; AVX-NEXT: vmovaps %ymm2, 32(%rdx)
4338 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4339 ; AVX-NEXT: vmovaps %ymm2, (%rdx)
4340 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4341 ; AVX-NEXT: vmovaps %ymm2, 64(%rdx)
4342 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4343 ; AVX-NEXT: vmovaps %ymm2, 32(%rcx)
4344 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4345 ; AVX-NEXT: vmovaps %ymm2, 96(%rcx)
4346 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4347 ; AVX-NEXT: vmovaps %ymm2, 64(%rcx)
4348 ; AVX-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
4349 ; AVX-NEXT: vmovaps %ymm2, (%rcx)
4350 ; AVX-NEXT: vmovaps %ymm1, 96(%r8)
4351 ; AVX-NEXT: vmovaps %ymm0, 32(%r8)
4352 ; AVX-NEXT: vmovaps %ymm3, 64(%r8)
4353 ; AVX-NEXT: vmovaps %ymm15, (%r8)
4354 ; AVX-NEXT: addq $776, %rsp # imm = 0x308
4355 ; AVX-NEXT: vzeroupper
4358 ; AVX2-LABEL: load_i16_stride4_vf64:
4360 ; AVX2-NEXT: subq $696, %rsp # imm = 0x2B8
4361 ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0
4362 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4363 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
4364 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4365 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4366 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4367 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4368 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4369 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4370 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4371 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4372 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4373 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4374 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4375 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
4376 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4377 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4378 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4379 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4380 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4381 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4382 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4383 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
4384 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4385 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4386 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4387 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4388 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4389 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4390 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4391 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4392 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4393 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4394 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4395 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
4396 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4397 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4398 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4399 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4400 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4401 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4402 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4403 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
4404 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4405 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4406 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4407 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4408 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4409 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4410 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4411 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4412 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4413 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4414 ; AVX2-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4415 ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm4
4416 ; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4417 ; AVX2-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4418 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4419 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4420 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4421 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4422 ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4423 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
4424 ; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4425 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4426 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4427 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4428 ; AVX2-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4429 ; AVX2-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4430 ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3
4431 ; AVX2-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4432 ; AVX2-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4433 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4434 ; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4435 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
4436 ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4437 ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
4438 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4439 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
4440 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4441 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4442 ; AVX2-NEXT: vmovdqa 256(%rdi), %xmm4
4443 ; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4444 ; AVX2-NEXT: vmovdqa 272(%rdi), %xmm3
4445 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4446 ; AVX2-NEXT: vmovdqa 288(%rdi), %xmm5
4447 ; AVX2-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
4448 ; AVX2-NEXT: vmovdqa 304(%rdi), %xmm6
4449 ; AVX2-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4450 ; AVX2-NEXT: vmovdqa 368(%rdi), %xmm0
4451 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4452 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4453 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
4454 ; AVX2-NEXT: vmovdqa 352(%rdi), %xmm1
4455 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4456 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4457 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4458 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4459 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4460 ; AVX2-NEXT: vmovdqa 336(%rdi), %xmm1
4461 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4462 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4463 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
4464 ; AVX2-NEXT: vmovdqa 320(%rdi), %xmm2
4465 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4466 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4467 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4468 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4469 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4470 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4471 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm6[0,2,2,3]
4472 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4473 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[0,2,2,3]
4474 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
4475 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4476 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
4477 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4478 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
4479 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
4480 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4481 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
4482 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4483 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4484 ; AVX2-NEXT: vmovdqa 112(%rdi), %xmm0
4485 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4486 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4487 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
4488 ; AVX2-NEXT: vmovdqa 96(%rdi), %xmm1
4489 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4490 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4491 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4492 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4493 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm1
4494 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4495 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4496 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
4497 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2
4498 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4499 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4500 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4501 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4502 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4503 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4504 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4505 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm14
4506 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm14[0,2,2,3]
4507 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4508 ; AVX2-NEXT: vmovdqa 32(%rdi), %xmm8
4509 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm8[0,2,2,3]
4510 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
4511 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4512 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm12
4513 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[0,2,2,3]
4514 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4515 ; AVX2-NEXT: vmovdqa (%rdi), %xmm9
4516 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm9[0,2,2,3]
4517 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
4518 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
4519 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
4520 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4521 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4522 ; AVX2-NEXT: vmovdqa 240(%rdi), %xmm0
4523 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4524 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
4525 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
4526 ; AVX2-NEXT: vmovdqa 224(%rdi), %xmm1
4527 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4528 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4529 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4530 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4531 ; AVX2-NEXT: vmovdqa 208(%rdi), %xmm1
4532 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4533 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4534 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
4535 ; AVX2-NEXT: vmovdqa 192(%rdi), %xmm2
4536 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4537 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4538 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4539 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4540 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4541 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4542 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4543 ; AVX2-NEXT: vmovdqa 176(%rdi), %xmm1
4544 ; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4545 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
4546 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4547 ; AVX2-NEXT: vmovdqa 160(%rdi), %xmm2
4548 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4549 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4550 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7]
4551 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4552 ; AVX2-NEXT: vmovdqa 144(%rdi), %xmm2
4553 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4554 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
4555 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
4556 ; AVX2-NEXT: vmovdqa 128(%rdi), %xmm3
4557 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4558 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm3[0,2,2,3]
4559 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
4560 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1]
4561 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
4562 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4563 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4564 ; AVX2-NEXT: vmovdqa 496(%rdi), %xmm11
4565 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[0,2,2,3]
4566 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7]
4567 ; AVX2-NEXT: vmovdqa 480(%rdi), %xmm13
4568 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[0,2,2,3]
4569 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4570 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4571 ; AVX2-NEXT: vmovdqa 464(%rdi), %xmm15
4572 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm15[0,2,2,3]
4573 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
4574 ; AVX2-NEXT: vmovdqa 448(%rdi), %xmm6
4575 ; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm6[0,2,2,3]
4576 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
4577 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
4578 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4579 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4580 ; AVX2-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4581 ; AVX2-NEXT: vmovdqa 432(%rdi), %xmm5
4582 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm5[0,2,2,3]
4583 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7]
4584 ; AVX2-NEXT: vmovdqa 416(%rdi), %xmm4
4585 ; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm4[0,2,2,3]
4586 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7]
4587 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
4588 ; AVX2-NEXT: vmovdqa 400(%rdi), %xmm7
4589 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[0,2,2,3]
4590 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[1,3,2,3,4,5,6,7]
4591 ; AVX2-NEXT: vmovdqa 384(%rdi), %xmm1
4592 ; AVX2-NEXT: vpshufd {{.*#+}} xmm10 = xmm1[0,2,2,3]
4593 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7]
4594 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
4595 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
4596 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
4597 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4598 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4599 ; AVX2-NEXT: # xmm0 = mem[3,1,2,3]
4600 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4601 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4602 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4603 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4604 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4605 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4606 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4607 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4608 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4609 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4610 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4611 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3]
4612 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4613 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4614 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
4615 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4616 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4617 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4618 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4619 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
4620 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4621 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm8[3,1,2,3]
4622 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4623 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4624 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7]
4625 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4626 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[3,1,2,3]
4627 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4628 ; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm9[3,1,2,3]
4629 ; AVX2-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4630 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
4631 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,0,2,3,4,5,6,7]
4632 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm8[0],xmm3[0],xmm8[1],xmm3[1]
4633 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
4634 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4635 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4636 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4637 ; AVX2-NEXT: # xmm0 = mem[3,1,2,3]
4638 ; AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4639 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4640 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4641 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4642 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4643 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4644 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4645 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4646 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4647 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4648 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4649 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3]
4650 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4651 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4652 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
4653 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4654 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4655 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4656 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4657 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4658 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4659 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4660 ; AVX2-NEXT: vpshufd $231, (%rsp), %xmm3 # 16-byte Folded Reload
4661 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3]
4662 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4663 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4664 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7]
4665 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4666 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
4667 ; AVX2-NEXT: # xmm12 = mem[3,1,2,3]
4668 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
4669 ; AVX2-NEXT: # xmm10 = mem[3,1,2,3]
4670 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[2,0,2,3,4,5,6,7]
4671 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm14 = xmm10[2,0,2,3,4,5,6,7]
4672 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm14[0],xmm3[0],xmm14[1],xmm3[1]
4673 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
4674 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4675 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4676 ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm11[3,1,2,3]
4677 ; AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
4678 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3]
4679 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4680 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
4681 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4682 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
4683 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
4684 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4685 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,1,2,3]
4686 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4687 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
4688 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
4689 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4690 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4691 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4692 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
4693 ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm5[3,1,2,3]
4694 ; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4695 ; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[3,1,2,3]
4696 ; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4697 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
4698 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,2,0,4,5,6,7]
4699 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4700 ; AVX2-NEXT: vpshufd {{.*#+}} xmm13 = xmm7[3,1,2,3]
4701 ; AVX2-NEXT: vpshufd {{.*#+}} xmm11 = xmm1[3,1,2,3]
4702 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm13[2,0,2,3,4,5,6,7]
4703 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7]
4704 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
4705 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3]
4706 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4707 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4708 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
4709 ; AVX2-NEXT: # xmm9 = mem[3,1,2,3]
4710 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
4711 ; AVX2-NEXT: # xmm8 = mem[3,1,2,3]
4712 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,2,0,4,5,6,7]
4713 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[0,1,2,0,4,5,6,7]
4714 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4715 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
4716 ; AVX2-NEXT: # xmm7 = mem[3,1,2,3]
4717 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
4718 ; AVX2-NEXT: # xmm6 = mem[3,1,2,3]
4719 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm7[2,0,2,3,4,5,6,7]
4720 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[2,0,2,3,4,5,6,7]
4721 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4722 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4723 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4724 ; AVX2-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4725 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
4726 ; AVX2-NEXT: # xmm5 = mem[3,1,2,3]
4727 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4728 ; AVX2-NEXT: # xmm4 = mem[3,1,2,3]
4729 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,2,0,4,5,6,7]
4730 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,2,0,4,5,6,7]
4731 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4732 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4733 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3]
4734 ; AVX2-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
4735 ; AVX2-NEXT: # xmm2 = mem[3,1,2,3]
4736 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm3[2,0,2,3,4,5,6,7]
4737 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7]
4738 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
4739 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
4740 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm14[4,5,6,7]
4741 ; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4742 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4743 ; AVX2-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
4744 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4745 ; AVX2-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4746 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4747 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4748 ; AVX2-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
4749 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4750 ; AVX2-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7]
4751 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
4752 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4753 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4754 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4755 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4756 ; AVX2-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4757 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4758 ; AVX2-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7]
4759 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
4760 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4761 ; AVX2-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7]
4762 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
4763 ; AVX2-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7]
4764 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1]
4765 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm1[2,3]
4766 ; AVX2-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4767 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
4768 ; AVX2-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
4769 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4770 ; AVX2-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4771 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4772 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4773 ; AVX2-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
4774 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4775 ; AVX2-NEXT: # xmm14 = mem[3,1,2,3,4,5,6,7]
4776 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
4777 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4778 ; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4779 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4780 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
4781 ; AVX2-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
4782 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
4783 ; AVX2-NEXT: # xmm14 = mem[0,1,3,1,4,5,6,7]
4784 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm14[0],xmm1[0],xmm14[1],xmm1[1]
4785 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7]
4786 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7]
4787 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1]
4788 ; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm10[0,1],xmm1[2,3]
4789 ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4790 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm9[0,1,3,1,4,5,6,7]
4791 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7]
4792 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[1],xmm0[1]
4793 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7]
4794 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7]
4795 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1]
4796 ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4797 ; AVX2-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
4798 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],ymm0[6,7]
4799 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
4800 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7]
4801 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
4802 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
4803 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
4804 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
4805 ; AVX2-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
4806 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
4807 ; AVX2-NEXT: vpshuflw $116, (%rsp), %xmm2 # 16-byte Folded Reload
4808 ; AVX2-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
4809 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4810 ; AVX2-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
4811 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4812 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4813 ; AVX2-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
4814 ; AVX2-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4815 ; AVX2-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7]
4816 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4817 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4818 ; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4819 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4820 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
4821 ; AVX2-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
4822 ; AVX2-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
4823 ; AVX2-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7]
4824 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
4825 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7]
4826 ; AVX2-NEXT: vpshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7]
4827 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
4828 ; AVX2-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
4829 ; AVX2-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
4830 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4831 ; AVX2-NEXT: vmovaps %ymm3, 32(%rsi)
4832 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4833 ; AVX2-NEXT: vmovaps %ymm3, 96(%rsi)
4834 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4835 ; AVX2-NEXT: vmovaps %ymm3, 64(%rsi)
4836 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4837 ; AVX2-NEXT: vmovaps %ymm3, (%rsi)
4838 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4839 ; AVX2-NEXT: vmovaps %ymm3, 96(%rdx)
4840 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4841 ; AVX2-NEXT: vmovaps %ymm3, 32(%rdx)
4842 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4843 ; AVX2-NEXT: vmovaps %ymm3, (%rdx)
4844 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4845 ; AVX2-NEXT: vmovaps %ymm3, 64(%rdx)
4846 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4847 ; AVX2-NEXT: vmovaps %ymm3, 32(%rcx)
4848 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4849 ; AVX2-NEXT: vmovaps %ymm3, 96(%rcx)
4850 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4851 ; AVX2-NEXT: vmovaps %ymm3, 64(%rcx)
4852 ; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
4853 ; AVX2-NEXT: vmovaps %ymm3, (%rcx)
4854 ; AVX2-NEXT: vmovdqa %ymm2, 96(%r8)
4855 ; AVX2-NEXT: vmovdqa %ymm0, 32(%r8)
4856 ; AVX2-NEXT: vmovdqa %ymm1, 64(%r8)
4857 ; AVX2-NEXT: vmovdqa %ymm15, (%r8)
4858 ; AVX2-NEXT: addq $696, %rsp # imm = 0x2B8
4859 ; AVX2-NEXT: vzeroupper
4862 ; AVX2-FP-LABEL: load_i16_stride4_vf64:
4864 ; AVX2-FP-NEXT: subq $712, %rsp # imm = 0x2C8
4865 ; AVX2-FP-NEXT: vpxor %xmm0, %xmm0, %xmm0
4866 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4867 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
4868 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4869 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4870 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4871 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4872 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4873 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4874 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4875 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4876 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4877 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4878 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4879 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
4880 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4881 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4882 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4883 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4884 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4885 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4886 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4887 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
4888 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4889 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4890 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4891 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4892 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4893 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4894 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4895 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4896 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4897 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4898 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4899 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
4900 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4901 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4902 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4903 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4904 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4905 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4906 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4907 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
4908 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4909 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4910 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4911 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4912 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4913 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4914 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4915 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4916 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4917 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4918 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4919 ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm4
4920 ; AVX2-FP-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
4921 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm3, %xmm3
4922 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4923 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
4924 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
4925 ; AVX2-FP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4926 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4927 ; AVX2-FP-NEXT: vextracti128 $1, %ymm1, %xmm2
4928 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
4929 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4930 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4931 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4932 ; AVX2-FP-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
4933 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4934 ; AVX2-FP-NEXT: vextracti128 $1, %ymm2, %xmm3
4935 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
4936 ; AVX2-FP-NEXT: vpackusdw %xmm2, %xmm2, %xmm2
4937 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
4938 ; AVX2-FP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm0[1,2,3],mem[4],ymm0[5,6,7],mem[8],ymm0[9,10,11],mem[12],ymm0[13,14,15]
4939 ; AVX2-FP-NEXT: vextracti128 $1, %ymm0, %xmm3
4940 ; AVX2-FP-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
4941 ; AVX2-FP-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
4942 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4943 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7]
4944 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4945 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4946 ; AVX2-FP-NEXT: vmovdqa 256(%rdi), %xmm5
4947 ; AVX2-FP-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4948 ; AVX2-FP-NEXT: vmovdqa 288(%rdi), %xmm3
4949 ; AVX2-FP-NEXT: vmovdqa %xmm3, (%rsp) # 16-byte Spill
4950 ; AVX2-FP-NEXT: vmovdqa 304(%rdi), %xmm7
4951 ; AVX2-FP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4952 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
4953 ; AVX2-FP-NEXT: vmovdqa 368(%rdi), %xmm0
4954 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4955 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm0
4956 ; AVX2-FP-NEXT: vmovdqa 352(%rdi), %xmm1
4957 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4958 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm1
4959 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
4960 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
4961 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm6 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
4962 ; AVX2-FP-NEXT: vmovdqa 336(%rdi), %xmm1
4963 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4964 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm1
4965 ; AVX2-FP-NEXT: vmovdqa 320(%rdi), %xmm2
4966 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4967 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
4968 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4969 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
4970 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
4971 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm1
4972 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm2
4973 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
4974 ; AVX2-FP-NEXT: vmovdqa 272(%rdi), %xmm2
4975 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4976 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm2, %xmm2
4977 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm5, %xmm3
4978 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4979 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
4980 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
4981 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
4982 ; AVX2-FP-NEXT: vmovdqa 112(%rdi), %xmm5
4983 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm5, %xmm2
4984 ; AVX2-FP-NEXT: vmovdqa 96(%rdi), %xmm11
4985 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm11, %xmm3
4986 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
4987 ; AVX2-FP-NEXT: vmovdqa 80(%rdi), %xmm12
4988 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm12, %xmm7
4989 ; AVX2-FP-NEXT: vmovdqa 64(%rdi), %xmm15
4990 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm15, %xmm8
4991 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
4992 ; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
4993 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
4994 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],ymm3[6,7]
4995 ; AVX2-FP-NEXT: vmovdqa 32(%rdi), %xmm0
4996 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
4997 ; AVX2-FP-NEXT: vmovdqa 48(%rdi), %xmm3
4998 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm3, %xmm8
4999 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm9
5000 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
5001 ; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
5002 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5003 ; AVX2-FP-NEXT: vmovdqa 16(%rdi), %xmm1
5004 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5005 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm9
5006 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm10
5007 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
5008 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
5009 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
5010 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5011 ; AVX2-FP-NEXT: vmovdqa 240(%rdi), %xmm0
5012 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5013 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm7
5014 ; AVX2-FP-NEXT: vmovdqa 224(%rdi), %xmm0
5015 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5016 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm8
5017 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
5018 ; AVX2-FP-NEXT: vmovdqa 208(%rdi), %xmm0
5019 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5020 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm8
5021 ; AVX2-FP-NEXT: vmovdqa 192(%rdi), %xmm0
5022 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5023 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm9
5024 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
5025 ; AVX2-FP-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
5026 ; AVX2-FP-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5027 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7]
5028 ; AVX2-FP-NEXT: vmovdqa 160(%rdi), %xmm0
5029 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5030 ; AVX2-FP-NEXT: vmovdqa 176(%rdi), %xmm1
5031 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5032 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm1, %xmm8
5033 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm0, %xmm9
5034 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
5035 ; AVX2-FP-NEXT: vmovdqa 128(%rdi), %xmm0
5036 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5037 ; AVX2-FP-NEXT: vmovdqa 144(%rdi), %xmm1
5038 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5039 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm1, %xmm9
5040 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm10
5041 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
5042 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3]
5043 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm7[4,5,6,7]
5044 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5045 ; AVX2-FP-NEXT: vmovdqa 496(%rdi), %xmm8
5046 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm8, %xmm9
5047 ; AVX2-FP-NEXT: vmovdqa 480(%rdi), %xmm7
5048 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm7, %xmm10
5049 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm10[0],xmm9[0],xmm10[1],xmm9[1]
5050 ; AVX2-FP-NEXT: vmovdqa 464(%rdi), %xmm10
5051 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm10, %xmm14
5052 ; AVX2-FP-NEXT: vmovdqa 448(%rdi), %xmm0
5053 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5054 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm9
5055 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1]
5056 ; AVX2-FP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13
5057 ; AVX2-FP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9
5058 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7]
5059 ; AVX2-FP-NEXT: vmovdqa 416(%rdi), %xmm14
5060 ; AVX2-FP-NEXT: vmovdqa 432(%rdi), %xmm13
5061 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm13, %xmm0
5062 ; AVX2-FP-NEXT: vpshufb %xmm4, %xmm14, %xmm4
5063 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm0[0],xmm4[1],xmm0[1]
5064 ; AVX2-FP-NEXT: vmovdqa 384(%rdi), %xmm0
5065 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5066 ; AVX2-FP-NEXT: vmovdqa 400(%rdi), %xmm4
5067 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm4, %xmm1
5068 ; AVX2-FP-NEXT: vpshufb %xmm6, %xmm0, %xmm6
5069 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm6[0],xmm1[0],xmm6[1],xmm1[1]
5070 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
5071 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
5072 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5073 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[3,1,2,3]
5074 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5075 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm11[3,1,2,3]
5076 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5077 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
5078 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5079 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5080 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm12[3,1,2,3]
5081 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5082 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
5083 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5084 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
5085 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5086 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5087 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5088 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5089 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5090 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[3,1,2,3]
5091 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5092 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5093 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3]
5094 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5095 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5096 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
5097 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5098 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5099 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3]
5100 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5101 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5102 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3]
5103 ; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5104 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5105 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,0,2,3,4,5,6,7]
5106 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5107 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5108 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5109 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5110 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5111 ; AVX2-FP-NEXT: # xmm0 = mem[3,1,2,3]
5112 ; AVX2-FP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5113 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5114 ; AVX2-FP-NEXT: # xmm1 = mem[3,1,2,3]
5115 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5116 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
5117 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5118 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5119 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5120 ; AVX2-FP-NEXT: # xmm1 = mem[3,1,2,3]
5121 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5122 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5123 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3]
5124 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5125 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
5126 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5127 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5128 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5129 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5130 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5131 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5132 ; AVX2-FP-NEXT: # xmm1 = mem[3,1,2,3]
5133 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5134 ; AVX2-FP-NEXT: vpshufd $231, (%rsp), %xmm15 # 16-byte Folded Reload
5135 ; AVX2-FP-NEXT: # xmm15 = mem[3,1,2,3]
5136 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5137 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm15[0,1,2,0,4,5,6,7]
5138 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5139 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload
5140 ; AVX2-FP-NEXT: # xmm12 = mem[3,1,2,3]
5141 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
5142 ; AVX2-FP-NEXT: # xmm11 = mem[3,1,2,3]
5143 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm12[2,0,2,3,4,5,6,7]
5144 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[2,0,2,3,4,5,6,7]
5145 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5146 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5147 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5148 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5149 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm0 = xmm8[3,1,2,3]
5150 ; AVX2-FP-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill
5151 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm7[3,1,2,3]
5152 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5153 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,2,0,4,5,6,7]
5154 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5155 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5156 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm10[3,1,2,3]
5157 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5158 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5159 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3]
5160 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5161 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
5162 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5163 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5164 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5165 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5166 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5167 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm1 = xmm13[3,1,2,3]
5168 ; AVX2-FP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5169 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
5170 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5171 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5172 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
5173 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5174 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,1,2,3]
5175 ; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5176 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload
5177 ; AVX2-FP-NEXT: # xmm10 = mem[3,1,2,3]
5178 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5179 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm10[2,0,2,3,4,5,6,7]
5180 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5181 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5182 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5183 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5184 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
5185 ; AVX2-FP-NEXT: # xmm14 = mem[3,1,2,3]
5186 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
5187 ; AVX2-FP-NEXT: # xmm9 = mem[3,1,2,3]
5188 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,2,0,4,5,6,7]
5189 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,2,0,4,5,6,7]
5190 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5191 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload
5192 ; AVX2-FP-NEXT: # xmm8 = mem[3,1,2,3]
5193 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload
5194 ; AVX2-FP-NEXT: # xmm7 = mem[3,1,2,3]
5195 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[2,0,2,3,4,5,6,7]
5196 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,0,2,3,4,5,6,7]
5197 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5198 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5199 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5200 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5201 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload
5202 ; AVX2-FP-NEXT: # xmm6 = mem[3,1,2,3]
5203 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5204 ; AVX2-FP-NEXT: # xmm5 = mem[3,1,2,3]
5205 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7]
5206 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7]
5207 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5208 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5209 ; AVX2-FP-NEXT: # xmm4 = mem[3,1,2,3]
5210 ; AVX2-FP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5211 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3]
5212 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7]
5213 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm13 = xmm2[2,0,2,3,4,5,6,7]
5214 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1]
5215 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5216 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
5217 ; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5218 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5219 ; AVX2-FP-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
5220 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5221 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5222 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5223 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5224 ; AVX2-FP-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
5225 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5226 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5227 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5228 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5229 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5230 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5231 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5232 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5233 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5234 ; AVX2-FP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
5235 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5236 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5237 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5238 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload
5239 ; AVX2-FP-NEXT: # xmm13 = mem[3,1,2,3,4,5,6,7]
5240 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm13[0],xmm3[0],xmm13[1],xmm3[1]
5241 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5242 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5243 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
5244 ; AVX2-FP-NEXT: # xmm0 = mem[0,1,3,1,4,5,6,7]
5245 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5246 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5247 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5248 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5249 ; AVX2-FP-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7]
5250 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5251 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5252 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5253 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5254 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5255 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5256 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5257 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5258 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,3,1,4,5,6,7]
5259 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5260 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
5261 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7]
5262 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm11[0],xmm3[0],xmm11[1],xmm3[1]
5263 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5264 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5265 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm14[0,1,3,1,4,5,6,7]
5266 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,3,1,4,5,6,7]
5267 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5268 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm8[3,1,2,3,4,5,6,7]
5269 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7]
5270 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5271 ; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5272 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5273 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5274 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,3,1,4,5,6,7]
5275 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,3,1,4,5,6,7]
5276 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5277 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[3,1,2,3,4,5,6,7]
5278 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
5279 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
5280 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5281 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5282 ; AVX2-FP-NEXT: vpshuflw $116, (%rsp), %xmm1 # 16-byte Folded Reload
5283 ; AVX2-FP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5284 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5285 ; AVX2-FP-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
5286 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5287 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5288 ; AVX2-FP-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7]
5289 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5290 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5291 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5292 ; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5293 ; AVX2-FP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5294 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5295 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5296 ; AVX2-FP-NEXT: # xmm2 = mem[0,1,3,1,4,5,6,7]
5297 ; AVX2-FP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5298 ; AVX2-FP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
5299 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5300 ; AVX2-FP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5301 ; AVX2-FP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5302 ; AVX2-FP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm10[3,1,2,3,4,5,6,7]
5303 ; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5304 ; AVX2-FP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
5305 ; AVX2-FP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5306 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5307 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rsi)
5308 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5309 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rsi)
5310 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5311 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rsi)
5312 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5313 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rsi)
5314 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5315 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rdx)
5316 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5317 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rdx)
5318 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5319 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rdx)
5320 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5321 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rdx)
5322 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5323 ; AVX2-FP-NEXT: vmovaps %ymm2, 32(%rcx)
5324 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5325 ; AVX2-FP-NEXT: vmovaps %ymm2, 96(%rcx)
5326 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5327 ; AVX2-FP-NEXT: vmovaps %ymm2, 64(%rcx)
5328 ; AVX2-FP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5329 ; AVX2-FP-NEXT: vmovaps %ymm2, (%rcx)
5330 ; AVX2-FP-NEXT: vmovdqa %ymm1, 96(%r8)
5331 ; AVX2-FP-NEXT: vmovdqa %ymm0, 32(%r8)
5332 ; AVX2-FP-NEXT: vmovdqa %ymm11, 64(%r8)
5333 ; AVX2-FP-NEXT: vmovdqa %ymm13, (%r8)
5334 ; AVX2-FP-NEXT: addq $712, %rsp # imm = 0x2C8
5335 ; AVX2-FP-NEXT: vzeroupper
5336 ; AVX2-FP-NEXT: retq
5338 ; AVX2-FCP-LABEL: load_i16_stride4_vf64:
5339 ; AVX2-FCP: # %bb.0:
5340 ; AVX2-FCP-NEXT: subq $680, %rsp # imm = 0x2A8
5341 ; AVX2-FCP-NEXT: vmovdqa 320(%rdi), %ymm7
5342 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5343 ; AVX2-FCP-NEXT: vmovdqa 352(%rdi), %ymm6
5344 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5345 ; AVX2-FCP-NEXT: vmovdqa 64(%rdi), %ymm5
5346 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5347 ; AVX2-FCP-NEXT: vmovdqa 96(%rdi), %ymm4
5348 ; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5349 ; AVX2-FCP-NEXT: vpxor %xmm1, %xmm1, %xmm1
5350 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm0 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5351 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm0, %xmm2
5352 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
5353 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5354 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
5355 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
5356 ; AVX2-FCP-NEXT: vpackusdw %xmm0, %xmm2, %xmm2
5357 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,2,2,3,0,2,4,6]
5358 ; AVX2-FCP-NEXT: vpermd %ymm4, %ymm0, %ymm3
5359 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5360 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
5361 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3
5362 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5
5363 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5364 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5
5365 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm5[0,1,2,3,4,5],ymm3[6,7]
5366 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4,5,6,7]
5367 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5368 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm2 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5369 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm2, %xmm3
5370 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm2, %xmm2
5371 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5372 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
5373 ; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
5374 ; AVX2-FCP-NEXT: vpackusdw %xmm2, %xmm3, %xmm3
5375 ; AVX2-FCP-NEXT: vpermd %ymm6, %ymm0, %ymm2
5376 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm5
5377 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm6
5378 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm6, %ymm7
5379 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
5380 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
5381 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5382 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5383 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
5384 ; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
5385 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm5 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5386 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm5, %xmm7
5387 ; AVX2-FCP-NEXT: vpackusdw %xmm7, %xmm5, %xmm5
5388 ; AVX2-FCP-NEXT: vmovdqa 448(%rdi), %ymm7
5389 ; AVX2-FCP-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5390 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm5, %xmm3
5391 ; AVX2-FCP-NEXT: vmovdqa 480(%rdi), %ymm5
5392 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5393 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm5
5394 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5395 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm5, %ymm5
5396 ; AVX2-FCP-NEXT: vpermd %ymm7, %ymm0, %ymm14
5397 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm14, %ymm7
5398 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2,3,4,5],ymm5[6,7]
5399 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm5[4,5,6,7]
5400 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5401 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm3 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5402 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm3, %xmm5
5403 ; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm3, %xmm3
5404 ; AVX2-FCP-NEXT: vpblendw {{.*#+}} ymm1 = mem[0],ymm1[1,2,3],mem[4],ymm1[5,6,7],mem[8],ymm1[9,10,11],mem[12],ymm1[13,14,15]
5405 ; AVX2-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5
5406 ; AVX2-FCP-NEXT: vpackusdw %xmm5, %xmm1, %xmm1
5407 ; AVX2-FCP-NEXT: vpackusdw %xmm3, %xmm1, %xmm1
5408 ; AVX2-FCP-NEXT: vmovdqa 192(%rdi), %ymm5
5409 ; AVX2-FCP-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5410 ; AVX2-FCP-NEXT: vmovdqa 224(%rdi), %ymm3
5411 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5412 ; AVX2-FCP-NEXT: vpermd %ymm3, %ymm0, %ymm9
5413 ; AVX2-FCP-NEXT: vpermd %ymm5, %ymm0, %ymm3
5414 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm9, %ymm0
5415 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm5
5416 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3,4,5],ymm0[6,7]
5417 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5418 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5419 ; AVX2-FCP-NEXT: vmovdqa 288(%rdi), %xmm1
5420 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5421 ; AVX2-FCP-NEXT: vmovdqa 304(%rdi), %xmm12
5422 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,2,3,10,11,8,9,10,11,12,13,14,15]
5423 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm12, %xmm0
5424 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm1
5425 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
5426 ; AVX2-FCP-NEXT: vmovdqa 256(%rdi), %xmm10
5427 ; AVX2-FCP-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5428 ; AVX2-FCP-NEXT: vmovdqa 272(%rdi), %xmm7
5429 ; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5430 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
5431 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm7
5432 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm10, %xmm8
5433 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm8[0],xmm7[0],xmm8[1],xmm7[1]
5434 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3]
5435 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
5436 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
5437 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm6, %ymm6
5438 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
5439 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5,6,7]
5440 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5441 ; AVX2-FCP-NEXT: vmovdqa 32(%rdi), %xmm2
5442 ; AVX2-FCP-NEXT: vmovdqa 48(%rdi), %xmm1
5443 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm1, %xmm6
5444 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm7
5445 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
5446 ; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7
5447 ; AVX2-FCP-NEXT: vmovdqa 16(%rdi), %xmm15
5448 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm11
5449 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm7, %xmm13
5450 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
5451 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
5452 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5453 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm11
5454 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5455 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm13
5456 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm13[0,1,2,3,4,5],ymm11[6,7]
5457 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3],ymm11[4,5,6,7]
5458 ; AVX2-FCP-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5459 ; AVX2-FCP-NEXT: vmovdqa 160(%rdi), %xmm8
5460 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5461 ; AVX2-FCP-NEXT: vmovdqa 176(%rdi), %xmm6
5462 ; AVX2-FCP-NEXT: vmovdqa %xmm6, (%rsp) # 16-byte Spill
5463 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm6
5464 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm8, %xmm11
5465 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm11[0],xmm6[0],xmm11[1],xmm6[1]
5466 ; AVX2-FCP-NEXT: vmovdqa 128(%rdi), %xmm8
5467 ; AVX2-FCP-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5468 ; AVX2-FCP-NEXT: vmovdqa 144(%rdi), %xmm11
5469 ; AVX2-FCP-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5470 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm11, %xmm11
5471 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm13
5472 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1]
5473 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm6 = xmm11[0,1],xmm6[2,3]
5474 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm9, %ymm9
5475 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
5476 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm9[6,7]
5477 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
5478 ; AVX2-FCP-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5479 ; AVX2-FCP-NEXT: vmovdqa 416(%rdi), %xmm6
5480 ; AVX2-FCP-NEXT: vmovdqa 432(%rdi), %xmm13
5481 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm13, %xmm3
5482 ; AVX2-FCP-NEXT: vpshufb %xmm5, %xmm6, %xmm5
5483 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm9 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
5484 ; AVX2-FCP-NEXT: vmovdqa 384(%rdi), %xmm5
5485 ; AVX2-FCP-NEXT: vmovdqa 400(%rdi), %xmm3
5486 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm3, %xmm11
5487 ; AVX2-FCP-NEXT: vpshufb %xmm0, %xmm5, %xmm0
5488 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
5489 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm9[2,3]
5490 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
5491 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm9
5492 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm14, %ymm11
5493 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
5494 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7]
5495 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5496 ; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,2,3,1,3,5,7]
5497 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
5498 ; AVX2-FCP-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5499 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm11 # 32-byte Folded Reload
5500 ; AVX2-FCP-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5501 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm9
5502 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm11
5503 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm11[0,1,2,3,4,5],ymm9[6,7]
5504 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
5505 ; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5506 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
5507 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5508 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,2,0,4,5,6,7]
5509 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
5510 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5511 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm15[3,1,2,3]
5512 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5513 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[3,1,2,3]
5514 ; AVX2-FCP-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5515 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7]
5516 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[2,0,2,3,4,5,6,7]
5517 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
5518 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3]
5519 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm9[4,5,6,7]
5520 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5521 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
5522 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5523 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
5524 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5525 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
5526 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5527 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5528 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
5529 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5530 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload
5531 ; AVX2-FCP-NEXT: # xmm14 = mem[3,1,2,3]
5532 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
5533 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm7 = xmm14[0,1,2,0,4,5,6,7]
5534 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm7[0],xmm2[0],xmm7[1],xmm2[1]
5535 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload
5536 ; AVX2-FCP-NEXT: # xmm11 = mem[3,1,2,3]
5537 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload
5538 ; AVX2-FCP-NEXT: # xmm9 = mem[3,1,2,3]
5539 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm12 = xmm11[2,0,2,3,4,5,6,7]
5540 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm9[2,0,2,3,4,5,6,7]
5541 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm15[0],xmm12[0],xmm15[1],xmm12[1]
5542 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm12[0,1],xmm2[2,3]
5543 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5544 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5545 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload
5546 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5547 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload
5548 ; AVX2-FCP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5549 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm1, %ymm1
5550 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm2
5551 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7]
5552 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm2 = xmm13[3,1,2,3]
5553 ; AVX2-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5554 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[3,1,2,3]
5555 ; AVX2-FCP-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
5556 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,2,0,4,5,6,7]
5557 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,2,0,4,5,6,7]
5558 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
5559 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm13 = xmm3[3,1,2,3]
5560 ; AVX2-FCP-NEXT: vpshufd {{.*#+}} xmm12 = xmm5[3,1,2,3]
5561 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm13[2,0,2,3,4,5,6,7]
5562 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[2,0,2,3,4,5,6,7]
5563 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
5564 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3]
5565 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5566 ; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5567 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm8 # 32-byte Folded Reload
5568 ; AVX2-FCP-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm7 # 32-byte Folded Reload
5569 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm8, %ymm0
5570 ; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm7, %ymm1
5571 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5572 ; AVX2-FCP-NEXT: vpshufd $231, (%rsp), %xmm6 # 16-byte Folded Reload
5573 ; AVX2-FCP-NEXT: # xmm6 = mem[3,1,2,3]
5574 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload
5575 ; AVX2-FCP-NEXT: # xmm5 = mem[3,1,2,3]
5576 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[0,1,2,0,4,5,6,7]
5577 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm5[0,1,2,0,4,5,6,7]
5578 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
5579 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5580 ; AVX2-FCP-NEXT: # xmm4 = mem[3,1,2,3]
5581 ; AVX2-FCP-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
5582 ; AVX2-FCP-NEXT: # xmm2 = mem[3,1,2,3]
5583 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[2,0,2,3,4,5,6,7]
5584 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm15 = xmm2[2,0,2,3,4,5,6,7]
5585 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1]
5586 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
5587 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
5588 ; AVX2-FCP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
5589 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5590 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
5591 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5592 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
5593 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5594 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5595 ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5596 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5597 ; AVX2-FCP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
5598 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5599 ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5600 ; AVX2-FCP-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7]
5601 ; AVX2-FCP-NEXT: vpshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload
5602 ; AVX2-FCP-NEXT: # xmm15 = mem[3,1,2,3,4,5,6,7]
5603 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm15[0],xmm3[0],xmm15[1],xmm3[1]
5604 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5605 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5606 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
5607 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm0, %ymm0
5608 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
5609 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm1, %ymm1
5610 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5611 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
5612 ; AVX2-FCP-NEXT: # xmm1 = mem[0,1,3,1,4,5,6,7]
5613 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm14[0,1,3,1,4,5,6,7]
5614 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5615 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7]
5616 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7]
5617 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm9[0],xmm3[0],xmm9[1],xmm3[1]
5618 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm1 = xmm3[0,1],xmm1[2,3]
5619 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5620 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm8, %ymm1
5621 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm7, %ymm3
5622 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
5623 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,3,1,4,5,6,7]
5624 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,1,4,5,6,7]
5625 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
5626 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7]
5627 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
5628 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
5629 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3]
5630 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5631 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
5632 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm2, %ymm2
5633 ; AVX2-FCP-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5634 ; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm3, %ymm3
5635 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5636 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload
5637 ; AVX2-FCP-NEXT: # xmm3 = mem[0,1,3,1,4,5,6,7]
5638 ; AVX2-FCP-NEXT: vpshuflw $116, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload
5639 ; AVX2-FCP-NEXT: # xmm4 = mem[0,1,3,1,4,5,6,7]
5640 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5641 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[3,1,2,3,4,5,6,7]
5642 ; AVX2-FCP-NEXT: vpshuflw {{.*#+}} xmm5 = xmm12[3,1,2,3,4,5,6,7]
5643 ; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5644 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0,1],xmm3[2,3]
5645 ; AVX2-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5646 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5647 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rsi)
5648 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5649 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rsi)
5650 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5651 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rsi)
5652 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5653 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rsi)
5654 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5655 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rdx)
5656 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5657 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rdx)
5658 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5659 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rdx)
5660 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5661 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rdx)
5662 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5663 ; AVX2-FCP-NEXT: vmovaps %ymm3, 32(%rcx)
5664 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5665 ; AVX2-FCP-NEXT: vmovaps %ymm3, 96(%rcx)
5666 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5667 ; AVX2-FCP-NEXT: vmovaps %ymm3, 64(%rcx)
5668 ; AVX2-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
5669 ; AVX2-FCP-NEXT: vmovaps %ymm3, (%rcx)
5670 ; AVX2-FCP-NEXT: vmovdqa %ymm2, 96(%r8)
5671 ; AVX2-FCP-NEXT: vmovdqa %ymm1, 32(%r8)
5672 ; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r8)
5673 ; AVX2-FCP-NEXT: vmovdqa %ymm15, (%r8)
5674 ; AVX2-FCP-NEXT: addq $680, %rsp # imm = 0x2A8
5675 ; AVX2-FCP-NEXT: vzeroupper
5676 ; AVX2-FCP-NEXT: retq
5678 ; AVX512-LABEL: load_i16_stride4_vf64:
5680 ; AVX512-NEXT: subq $200, %rsp
5681 ; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm26
5682 ; AVX512-NEXT: vmovdqa64 384(%rdi), %zmm27
5683 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm28
5684 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm29
5685 ; AVX512-NEXT: vmovdqa 192(%rdi), %ymm0
5686 ; AVX512-NEXT: vpmovqw %ymm0, %xmm0
5687 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5688 ; AVX512-NEXT: vmovdqa 240(%rdi), %xmm14
5689 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
5690 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7]
5691 ; AVX512-NEXT: vmovdqa 224(%rdi), %xmm13
5692 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
5693 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
5694 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
5695 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5696 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
5697 ; AVX512-NEXT: vpmovqw %zmm29, %xmm1
5698 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5699 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm1
5700 ; AVX512-NEXT: vpmovqw %ymm1, %xmm1
5701 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5702 ; AVX512-NEXT: vmovdqa 112(%rdi), %xmm12
5703 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
5704 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
5705 ; AVX512-NEXT: vmovdqa 96(%rdi), %xmm11
5706 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
5707 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
5708 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5709 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5710 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
5711 ; AVX512-NEXT: vpmovqw %zmm28, %xmm4
5712 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
5713 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
5714 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5715 ; AVX512-NEXT: vmovdqa 448(%rdi), %ymm0
5716 ; AVX512-NEXT: vpmovqw %ymm0, %xmm0
5717 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5718 ; AVX512-NEXT: vmovdqa64 496(%rdi), %xmm24
5719 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[0,2,2,3]
5720 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7]
5721 ; AVX512-NEXT: vmovdqa64 480(%rdi), %xmm23
5722 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm23[0,2,2,3]
5723 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7]
5724 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
5725 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5726 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
5727 ; AVX512-NEXT: vpmovqw %zmm27, %xmm1
5728 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5729 ; AVX512-NEXT: vmovdqa 320(%rdi), %ymm1
5730 ; AVX512-NEXT: vpmovqw %ymm1, %xmm1
5731 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5732 ; AVX512-NEXT: vmovdqa64 368(%rdi), %xmm31
5733 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm31[0,2,2,3]
5734 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7]
5735 ; AVX512-NEXT: vmovdqa64 352(%rdi), %xmm25
5736 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm25[0,2,2,3]
5737 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7]
5738 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
5739 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5740 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
5741 ; AVX512-NEXT: vpmovqw %zmm26, %xmm1
5742 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5743 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3]
5744 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5745 ; AVX512-NEXT: vmovdqa64 320(%rdi), %xmm30
5746 ; AVX512-NEXT: vmovdqa64 336(%rdi), %xmm17
5747 ; AVX512-NEXT: vmovdqa64 448(%rdi), %xmm18
5748 ; AVX512-NEXT: vmovdqa64 464(%rdi), %xmm19
5749 ; AVX512-NEXT: vmovdqa64 64(%rdi), %xmm20
5750 ; AVX512-NEXT: vmovdqa64 80(%rdi), %xmm21
5751 ; AVX512-NEXT: vmovdqa 192(%rdi), %xmm0
5752 ; AVX512-NEXT: vmovdqa 208(%rdi), %xmm1
5753 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7]
5754 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
5755 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
5756 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5757 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
5758 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
5759 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
5760 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
5761 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
5762 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5763 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
5764 ; AVX512-NEXT: vpsrlq $16, %zmm29, %zmm6
5765 ; AVX512-NEXT: vpmovqw %zmm6, %xmm6
5766 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
5767 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
5768 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
5769 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
5770 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5771 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm21[0,2,2,3]
5772 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
5773 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm20[0,2,2,3]
5774 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
5775 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
5776 ; AVX512-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
5777 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
5778 ; AVX512-NEXT: vpsrlq $16, %zmm28, %zmm6
5779 ; AVX512-NEXT: vpmovqw %zmm6, %xmm6
5780 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
5781 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
5782 ; AVX512-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
5783 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7]
5784 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7]
5785 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5786 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5787 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm19[0,2,2,3]
5788 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
5789 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3]
5790 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
5791 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5792 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5793 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5794 ; AVX512-NEXT: vpsrlq $16, %zmm27, %zmm3
5795 ; AVX512-NEXT: vpmovqw %zmm3, %xmm3
5796 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5797 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,1,3,4,5,6,7]
5798 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
5799 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5800 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5801 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3]
5802 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
5803 ; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm30[0,2,2,3]
5804 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
5805 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5806 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5807 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
5808 ; AVX512-NEXT: vpsrlq $16, %zmm26, %zmm4
5809 ; AVX512-NEXT: vpmovqw %zmm4, %xmm4
5810 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
5811 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
5812 ; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5813 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
5814 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7]
5815 ; AVX512-NEXT: vmovdqa64 %xmm2, %xmm22
5816 ; AVX512-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
5817 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7]
5818 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5819 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5820 ; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3]
5821 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7]
5822 ; AVX512-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,1,2,3]
5823 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
5824 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
5825 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5826 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
5827 ; AVX512-NEXT: vpsrlq $32, %zmm29, %zmm1
5828 ; AVX512-NEXT: vpmovqw %zmm1, %xmm1
5829 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
5830 ; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
5831 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
5832 ; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
5833 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7]
5834 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
5835 ; AVX512-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
5836 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm21[3,1,2,3]
5837 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7]
5838 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21
5839 ; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[3,1,2,3]
5840 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,0,2,3,4,5,6,7]
5841 ; AVX512-NEXT: vmovdqa64 %xmm0, %xmm16
5842 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
5843 ; AVX512-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
5844 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
5845 ; AVX512-NEXT: vpsrlq $32, %zmm28, %zmm8
5846 ; AVX512-NEXT: vpmovqw %zmm8, %xmm8
5847 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
5848 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm1[0,1,2,3]
5849 ; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
5850 ; AVX512-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[3,1,2,3]
5851 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
5852 ; AVX512-NEXT: vpshufd {{.*#+}} xmm11 = xmm23[3,1,2,3]
5853 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,2,0,4,5,6,7]
5854 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
5855 ; AVX512-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3]
5856 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7]
5857 ; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
5858 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
5859 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
5860 ; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm31[3,1,2,3]
5861 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
5862 ; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[3,1,2,3]
5863 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,1,2,0,4,5,6,7]
5864 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
5865 ; AVX512-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[3,1,2,3]
5866 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7]
5867 ; AVX512-NEXT: vpshufd {{.*#+}} xmm9 = xmm30[3,1,2,3]
5868 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[2,0,2,3,4,5,6,7]
5869 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
5870 ; AVX512-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14
5871 ; AVX512-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm5
5872 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7]
5873 ; AVX512-NEXT: vpsrlq $32, %zmm27, %zmm14
5874 ; AVX512-NEXT: vpmovqw %zmm14, %xmm14
5875 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
5876 ; AVX512-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm14
5877 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5878 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7]
5879 ; AVX512-NEXT: vpsrlq $32, %zmm26, %zmm14
5880 ; AVX512-NEXT: vpmovqw %zmm14, %xmm14
5881 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
5882 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm5[0,1,2,3]
5883 ; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0
5884 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
5885 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7]
5886 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
5887 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7]
5888 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
5889 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
5890 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
5891 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
5892 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5893 ; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
5894 ; AVX512-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
5895 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
5896 ; AVX512-NEXT: vpsrlq $48, %zmm29, %zmm3
5897 ; AVX512-NEXT: vpmovqw %zmm3, %xmm3
5898 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
5899 ; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3
5900 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
5901 ; AVX512-NEXT: vmovdqa64 %xmm16, %xmm5
5902 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
5903 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
5904 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5905 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5906 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
5907 ; AVX512-NEXT: vpsrlq $48, %zmm28, %zmm3
5908 ; AVX512-NEXT: vpmovqw %zmm3, %xmm3
5909 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5910 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
5911 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7]
5912 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7]
5913 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
5914 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
5915 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
5916 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
5917 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
5918 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7]
5919 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
5920 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5921 ; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
5922 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
5923 ; AVX512-NEXT: vpsrlq $48, %zmm27, %zmm2
5924 ; AVX512-NEXT: vpmovqw %zmm2, %xmm2
5925 ; AVX512-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
5926 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7]
5927 ; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7]
5928 ; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
5929 ; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
5930 ; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
5931 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
5932 ; AVX512-NEXT: vpsrlq $48, %zmm26, %zmm3
5933 ; AVX512-NEXT: vpmovqw %zmm3, %xmm3
5934 ; AVX512-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
5935 ; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3]
5936 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5937 ; AVX512-NEXT: vmovaps %zmm2, 64(%rsi)
5938 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5939 ; AVX512-NEXT: vmovaps %zmm2, (%rsi)
5940 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5941 ; AVX512-NEXT: vmovaps %zmm2, 64(%rdx)
5942 ; AVX512-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload
5943 ; AVX512-NEXT: vmovaps %zmm2, (%rdx)
5944 ; AVX512-NEXT: vmovdqa64 %zmm17, 64(%rcx)
5945 ; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
5946 ; AVX512-NEXT: vmovaps %zmm2, (%rcx)
5947 ; AVX512-NEXT: vmovdqa64 %zmm1, 64(%r8)
5948 ; AVX512-NEXT: vmovdqa64 %zmm0, (%r8)
5949 ; AVX512-NEXT: addq $200, %rsp
5950 ; AVX512-NEXT: vzeroupper
5953 ; AVX512-FCP-LABEL: load_i16_stride4_vf64:
5954 ; AVX512-FCP: # %bb.0:
5955 ; AVX512-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
5956 ; AVX512-FCP-NEXT: vmovdqa64 384(%rdi), %zmm25
5957 ; AVX512-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
5958 ; AVX512-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
5959 ; AVX512-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
5960 ; AVX512-FCP-NEXT: # ymm5 = mem[0,1,0,1]
5961 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6]
5962 ; AVX512-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23
5963 ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm9, %ymm10
5964 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0
5965 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
5966 ; AVX512-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
5967 ; AVX512-FCP-NEXT: vpermd %ymm24, %ymm9, %ymm3
5968 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1
5969 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14]
5970 ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
5971 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm0
5972 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
5973 ; AVX512-FCP-NEXT: vmovdqa64 96(%rdi), %ymm26
5974 ; AVX512-FCP-NEXT: vpermd %ymm26, %ymm9, %ymm8
5975 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm11
5976 ; AVX512-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
5977 ; AVX512-FCP-NEXT: vpermd %ymm27, %ymm9, %ymm0
5978 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm12
5979 ; AVX512-FCP-NEXT: vpermt2d %ymm11, %ymm7, %ymm12
5980 ; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm11
5981 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
5982 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm1[0,1,2,3]
5983 ; AVX512-FCP-NEXT: vmovdqa64 480(%rdi), %ymm28
5984 ; AVX512-FCP-NEXT: vpermd %ymm28, %ymm9, %ymm11
5985 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm1
5986 ; AVX512-FCP-NEXT: vmovdqa64 448(%rdi), %ymm17
5987 ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm12
5988 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm13
5989 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm13
5990 ; AVX512-FCP-NEXT: vpmovqw %zmm25, %xmm1
5991 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm13[4,5,6,7]
5992 ; AVX512-FCP-NEXT: vmovdqa64 352(%rdi), %ymm18
5993 ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm1
5994 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
5995 ; AVX512-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19
5996 ; AVX512-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm13
5997 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm9
5998 ; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm9
5999 ; AVX512-FCP-NEXT: vpmovqw %zmm22, %xmm15
6000 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
6001 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm14[0,1,2,3]
6002 ; AVX512-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
6003 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10
6004 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
6005 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7]
6006 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm4, %zmm10
6007 ; AVX512-FCP-NEXT: vpmovqw %zmm10, %xmm10
6008 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
6009 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
6010 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6011 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
6012 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm2, %zmm8
6013 ; AVX512-FCP-NEXT: vpmovqw %zmm8, %xmm8
6014 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
6015 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm3[0,1,2,3]
6016 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm0
6017 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm3
6018 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6019 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm25, %zmm3
6020 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
6021 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6022 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
6023 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
6024 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6025 ; AVX512-FCP-NEXT: vpsrlq $16, %zmm22, %zmm3
6026 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
6027 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6028 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3]
6029 ; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7]
6030 ; AVX512-FCP-NEXT: vpermd %ymm23, %ymm14, %ymm3
6031 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0
6032 ; AVX512-FCP-NEXT: vpermd %ymm24, %ymm14, %ymm8
6033 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
6034 ; AVX512-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
6035 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
6036 ; AVX512-FCP-NEXT: vpmovqw %zmm0, %xmm0
6037 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6038 ; AVX512-FCP-NEXT: vpermd %ymm26, %ymm14, %ymm0
6039 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm12
6040 ; AVX512-FCP-NEXT: vpermd %ymm27, %ymm14, %ymm11
6041 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm13
6042 ; AVX512-FCP-NEXT: vpermt2d %ymm12, %ymm7, %ymm13
6043 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm2, %zmm12
6044 ; AVX512-FCP-NEXT: vpmovqw %zmm12, %xmm12
6045 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
6046 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm12[0,1,2,3],zmm1[0,1,2,3]
6047 ; AVX512-FCP-NEXT: vpermd %ymm28, %ymm14, %ymm12
6048 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm1
6049 ; AVX512-FCP-NEXT: vpermd %ymm17, %ymm14, %ymm13
6050 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm10
6051 ; AVX512-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm10
6052 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm25, %zmm1
6053 ; AVX512-FCP-NEXT: vpmovqw %zmm1, %xmm1
6054 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
6055 ; AVX512-FCP-NEXT: vpermd %ymm18, %ymm14, %ymm1
6056 ; AVX512-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
6057 ; AVX512-FCP-NEXT: vpermd %ymm19, %ymm14, %ymm5
6058 ; AVX512-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm6
6059 ; AVX512-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm6
6060 ; AVX512-FCP-NEXT: vpsrlq $32, %zmm22, %zmm7
6061 ; AVX512-FCP-NEXT: vpmovqw %zmm7, %xmm7
6062 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
6063 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
6064 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
6065 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7
6066 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
6067 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm4, %zmm4
6068 ; AVX512-FCP-NEXT: vpmovqw %zmm4, %xmm4
6069 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
6070 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6071 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm4
6072 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
6073 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm2, %zmm2
6074 ; AVX512-FCP-NEXT: vpmovqw %zmm2, %xmm2
6075 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6076 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
6077 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm2
6078 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
6079 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6080 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm25, %zmm3
6081 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
6082 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6083 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
6084 ; AVX512-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm3
6085 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6086 ; AVX512-FCP-NEXT: vpsrlq $48, %zmm22, %zmm3
6087 ; AVX512-FCP-NEXT: vpmovqw %zmm3, %xmm3
6088 ; AVX512-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6089 ; AVX512-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3]
6090 ; AVX512-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi)
6091 ; AVX512-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
6092 ; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx)
6093 ; AVX512-FCP-NEXT: vmovdqa64 %zmm29, (%rdx)
6094 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx)
6095 ; AVX512-FCP-NEXT: vmovdqa64 %zmm23, (%rcx)
6096 ; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
6097 ; AVX512-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
6098 ; AVX512-FCP-NEXT: vzeroupper
6099 ; AVX512-FCP-NEXT: retq
6101 ; AVX512DQ-LABEL: load_i16_stride4_vf64:
6102 ; AVX512DQ: # %bb.0:
6103 ; AVX512DQ-NEXT: subq $200, %rsp
6104 ; AVX512DQ-NEXT: vmovdqa64 256(%rdi), %zmm26
6105 ; AVX512DQ-NEXT: vmovdqa64 384(%rdi), %zmm27
6106 ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm28
6107 ; AVX512DQ-NEXT: vmovdqa64 128(%rdi), %zmm29
6108 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %ymm0
6109 ; AVX512DQ-NEXT: vpmovqw %ymm0, %xmm0
6110 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6111 ; AVX512DQ-NEXT: vmovdqa 240(%rdi), %xmm14
6112 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm14[0,2,2,3]
6113 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm9[0,1,0,2,4,5,6,7]
6114 ; AVX512DQ-NEXT: vmovdqa 224(%rdi), %xmm13
6115 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm13[0,2,2,3]
6116 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm6[0,1,0,2,4,5,6,7]
6117 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
6118 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6119 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6120 ; AVX512DQ-NEXT: vpmovqw %zmm29, %xmm1
6121 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6122 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm1
6123 ; AVX512DQ-NEXT: vpmovqw %ymm1, %xmm1
6124 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6125 ; AVX512DQ-NEXT: vmovdqa 112(%rdi), %xmm12
6126 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm12[0,2,2,3]
6127 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm3[0,1,0,2,4,5,6,7]
6128 ; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm11
6129 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm11[0,2,2,3]
6130 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm7[0,1,0,2,4,5,6,7]
6131 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6132 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6133 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
6134 ; AVX512DQ-NEXT: vpmovqw %zmm28, %xmm4
6135 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm1[4,5,6,7]
6136 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,2,3],zmm0[0,1,2,3]
6137 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6138 ; AVX512DQ-NEXT: vmovdqa 448(%rdi), %ymm0
6139 ; AVX512DQ-NEXT: vpmovqw %ymm0, %xmm0
6140 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6141 ; AVX512DQ-NEXT: vmovdqa64 496(%rdi), %xmm24
6142 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm24[0,2,2,3]
6143 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[0,1,0,2,4,5,6,7]
6144 ; AVX512DQ-NEXT: vmovdqa64 480(%rdi), %xmm23
6145 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm23[0,2,2,3]
6146 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm15[0,1,0,2,4,5,6,7]
6147 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
6148 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6149 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
6150 ; AVX512DQ-NEXT: vpmovqw %zmm27, %xmm1
6151 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6152 ; AVX512DQ-NEXT: vmovdqa 320(%rdi), %ymm1
6153 ; AVX512DQ-NEXT: vpmovqw %ymm1, %xmm1
6154 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6155 ; AVX512DQ-NEXT: vmovdqa64 368(%rdi), %xmm31
6156 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm31[0,2,2,3]
6157 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm5[0,1,0,2,4,5,6,7]
6158 ; AVX512DQ-NEXT: vmovdqa64 352(%rdi), %xmm25
6159 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm25[0,2,2,3]
6160 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm10 = xmm8[0,1,0,2,4,5,6,7]
6161 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm10[0],xmm0[0],xmm10[1],xmm0[1]
6162 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6163 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
6164 ; AVX512DQ-NEXT: vpmovqw %zmm26, %xmm1
6165 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6166 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[0,1,2,3]
6167 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6168 ; AVX512DQ-NEXT: vmovdqa64 320(%rdi), %xmm30
6169 ; AVX512DQ-NEXT: vmovdqa64 336(%rdi), %xmm17
6170 ; AVX512DQ-NEXT: vmovdqa64 448(%rdi), %xmm18
6171 ; AVX512DQ-NEXT: vmovdqa64 464(%rdi), %xmm19
6172 ; AVX512DQ-NEXT: vmovdqa64 64(%rdi), %xmm20
6173 ; AVX512DQ-NEXT: vmovdqa64 80(%rdi), %xmm21
6174 ; AVX512DQ-NEXT: vmovdqa 192(%rdi), %xmm0
6175 ; AVX512DQ-NEXT: vmovdqa 208(%rdi), %xmm1
6176 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm9[0,1,1,3,4,5,6,7]
6177 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7]
6178 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1]
6179 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6180 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[0,2,2,3]
6181 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
6182 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm0[0,2,2,3]
6183 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7]
6184 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm9[0],xmm6[0],xmm9[1],xmm6[1]
6185 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6186 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3,4,5],ymm2[6,7]
6187 ; AVX512DQ-NEXT: vpsrlq $16, %zmm29, %zmm6
6188 ; AVX512DQ-NEXT: vpmovqw %zmm6, %xmm6
6189 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm6[0,1,2,3],ymm2[4,5,6,7]
6190 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7]
6191 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[0,1,1,3,4,5,6,7]
6192 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1]
6193 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6194 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm21[0,2,2,3]
6195 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7]
6196 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm20[0,2,2,3]
6197 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7]
6198 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1]
6199 ; AVX512DQ-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
6200 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
6201 ; AVX512DQ-NEXT: vpsrlq $16, %zmm28, %zmm6
6202 ; AVX512DQ-NEXT: vpmovqw %zmm6, %xmm6
6203 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3],ymm3[4,5,6,7]
6204 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
6205 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, (%rsp) # 64-byte Spill
6206 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm4[0,1,1,3,4,5,6,7]
6207 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm15[0,1,1,3,4,5,6,7]
6208 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6209 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6210 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm19[0,2,2,3]
6211 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7]
6212 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm18[0,2,2,3]
6213 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
6214 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
6215 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6216 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6217 ; AVX512DQ-NEXT: vpsrlq $16, %zmm27, %zmm3
6218 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
6219 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6220 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[0,1,1,3,4,5,6,7]
6221 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm8[0,1,1,3,4,5,6,7]
6222 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
6223 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6224 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm17[0,2,2,3]
6225 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7]
6226 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm5 = xmm30[0,2,2,3]
6227 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7]
6228 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6229 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6230 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
6231 ; AVX512DQ-NEXT: vpsrlq $16, %zmm26, %zmm4
6232 ; AVX512DQ-NEXT: vpmovqw %zmm4, %xmm4
6233 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
6234 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm3[0,1,2,3],zmm2[0,1,2,3]
6235 ; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6236 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm14[3,1,2,3]
6237 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[0,1,2,0,4,5,6,7]
6238 ; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm22
6239 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[3,1,2,3]
6240 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm13[0,1,2,0,4,5,6,7]
6241 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
6242 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6243 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm6 = xmm1[3,1,2,3]
6244 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm6[2,0,2,3,4,5,6,7]
6245 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,1,2,3]
6246 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm7[2,0,2,3,4,5,6,7]
6247 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
6248 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6249 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6,7]
6250 ; AVX512DQ-NEXT: vpsrlq $32, %zmm29, %zmm1
6251 ; AVX512DQ-NEXT: vpmovqw %zmm1, %xmm1
6252 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
6253 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = xmm12[3,1,2,3]
6254 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[0,1,2,0,4,5,6,7]
6255 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
6256 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm3[0,1,2,0,4,5,6,7]
6257 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1]
6258 ; AVX512DQ-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
6259 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm21[3,1,2,3]
6260 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm0[2,0,2,3,4,5,6,7]
6261 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21
6262 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm0 = xmm20[3,1,2,3]
6263 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm0[2,0,2,3,4,5,6,7]
6264 ; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm16
6265 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
6266 ; AVX512DQ-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8
6267 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4,5],ymm4[6,7]
6268 ; AVX512DQ-NEXT: vpsrlq $32, %zmm28, %zmm8
6269 ; AVX512DQ-NEXT: vpmovqw %zmm8, %xmm8
6270 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3],ymm4[4,5,6,7]
6271 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm1[0,1,2,3]
6272 ; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
6273 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm10 = xmm24[3,1,2,3]
6274 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm10[0,1,2,0,4,5,6,7]
6275 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm11 = xmm23[3,1,2,3]
6276 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[0,1,2,0,4,5,6,7]
6277 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm20 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
6278 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm12 = xmm19[3,1,2,3]
6279 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm12[2,0,2,3,4,5,6,7]
6280 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm1 = xmm18[3,1,2,3]
6281 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm1[2,0,2,3,4,5,6,7]
6282 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm18 = xmm8[0],xmm4[0],xmm8[1],xmm4[1]
6283 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm4 = xmm31[3,1,2,3]
6284 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm8 = xmm4[0,1,2,0,4,5,6,7]
6285 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = xmm25[3,1,2,3]
6286 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[0,1,2,0,4,5,6,7]
6287 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm19 = xmm9[0],xmm8[0],xmm9[1],xmm8[1]
6288 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm8 = xmm17[3,1,2,3]
6289 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm8[2,0,2,3,4,5,6,7]
6290 ; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm9 = xmm30[3,1,2,3]
6291 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm14 = xmm9[2,0,2,3,4,5,6,7]
6292 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1]
6293 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm14
6294 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm5
6295 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm14[6,7]
6296 ; AVX512DQ-NEXT: vpsrlq $32, %zmm27, %zmm14
6297 ; AVX512DQ-NEXT: vpmovqw %zmm14, %xmm14
6298 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm5 = ymm14[0,1,2,3],ymm5[4,5,6,7]
6299 ; AVX512DQ-NEXT: vinserti32x4 $1, %xmm19, %ymm0, %ymm14
6300 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6301 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm14[6,7]
6302 ; AVX512DQ-NEXT: vpsrlq $32, %zmm26, %zmm14
6303 ; AVX512DQ-NEXT: vpmovqw %zmm14, %xmm14
6304 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm14[0,1,2,3],ymm0[4,5,6,7]
6305 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm17 = zmm0[0,1,2,3],zmm5[0,1,2,3]
6306 ; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0
6307 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7]
6308 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm13[0,1,3,1,4,5,6,7]
6309 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm5[0],xmm0[0],xmm5[1],xmm0[1]
6310 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm6[3,1,2,3,4,5,6,7]
6311 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm6 = xmm7[3,1,2,3,4,5,6,7]
6312 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
6313 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7]
6314 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7]
6315 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6316 ; AVX512DQ-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
6317 ; AVX512DQ-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm3
6318 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6319 ; AVX512DQ-NEXT: vpsrlq $48, %zmm29, %zmm3
6320 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
6321 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6322 ; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3
6323 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
6324 ; AVX512DQ-NEXT: vmovdqa64 %xmm16, %xmm5
6325 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7]
6326 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
6327 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6328 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6329 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6330 ; AVX512DQ-NEXT: vpsrlq $48, %zmm28, %zmm3
6331 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
6332 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6333 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm2[0,1,2,3],zmm0[0,1,2,3]
6334 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[0,1,3,1,4,5,6,7]
6335 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm11[0,1,3,1,4,5,6,7]
6336 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
6337 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm12[3,1,2,3,4,5,6,7]
6338 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
6339 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
6340 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm3 = xmm4[0,1,3,1,4,5,6,7]
6341 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm15[0,1,3,1,4,5,6,7]
6342 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
6343 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6344 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
6345 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
6346 ; AVX512DQ-NEXT: vpsrlq $48, %zmm27, %zmm2
6347 ; AVX512DQ-NEXT: vpmovqw %zmm2, %xmm2
6348 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7]
6349 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7]
6350 ; AVX512DQ-NEXT: vpshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7]
6351 ; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
6352 ; AVX512DQ-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3
6353 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
6354 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6,7]
6355 ; AVX512DQ-NEXT: vpsrlq $48, %zmm26, %zmm3
6356 ; AVX512DQ-NEXT: vpmovqw %zmm3, %xmm3
6357 ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6358 ; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[0,1,2,3],zmm1[0,1,2,3]
6359 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6360 ; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rsi)
6361 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6362 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rsi)
6363 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6364 ; AVX512DQ-NEXT: vmovaps %zmm2, 64(%rdx)
6365 ; AVX512DQ-NEXT: vmovups (%rsp), %zmm2 # 64-byte Reload
6366 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rdx)
6367 ; AVX512DQ-NEXT: vmovdqa64 %zmm17, 64(%rcx)
6368 ; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
6369 ; AVX512DQ-NEXT: vmovaps %zmm2, (%rcx)
6370 ; AVX512DQ-NEXT: vmovdqa64 %zmm1, 64(%r8)
6371 ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%r8)
6372 ; AVX512DQ-NEXT: addq $200, %rsp
6373 ; AVX512DQ-NEXT: vzeroupper
6374 ; AVX512DQ-NEXT: retq
6376 ; AVX512DQ-FCP-LABEL: load_i16_stride4_vf64:
6377 ; AVX512DQ-FCP: # %bb.0:
6378 ; AVX512DQ-FCP-NEXT: vmovdqa64 256(%rdi), %zmm22
6379 ; AVX512DQ-FCP-NEXT: vmovdqa64 384(%rdi), %zmm25
6380 ; AVX512DQ-FCP-NEXT: vmovdqa64 (%rdi), %zmm2
6381 ; AVX512DQ-FCP-NEXT: vmovdqa64 128(%rdi), %zmm4
6382 ; AVX512DQ-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0,16,17,20,21,0,0,0,0,24,25,28,29,0,0,0,0]
6383 ; AVX512DQ-FCP-NEXT: # ymm5 = mem[0,1,0,1]
6384 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,2,2,3,0,2,4,6]
6385 ; AVX512DQ-FCP-NEXT: vmovdqa64 224(%rdi), %ymm23
6386 ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm9, %ymm10
6387 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm10, %ymm0
6388 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
6389 ; AVX512DQ-FCP-NEXT: vmovdqa64 192(%rdi), %ymm24
6390 ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm9, %ymm3
6391 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm1
6392 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [0,2,2,3,4,6,12,14]
6393 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
6394 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm0
6395 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6396 ; AVX512DQ-FCP-NEXT: vmovdqa64 96(%rdi), %ymm26
6397 ; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm9, %ymm8
6398 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm8, %ymm11
6399 ; AVX512DQ-FCP-NEXT: vmovdqa64 64(%rdi), %ymm27
6400 ; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm9, %ymm0
6401 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm0, %ymm12
6402 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm11, %ymm7, %ymm12
6403 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm11
6404 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
6405 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm20 = zmm11[0,1,2,3],zmm1[0,1,2,3]
6406 ; AVX512DQ-FCP-NEXT: vmovdqa64 480(%rdi), %ymm28
6407 ; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm9, %ymm11
6408 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm11, %ymm1
6409 ; AVX512DQ-FCP-NEXT: vmovdqa64 448(%rdi), %ymm17
6410 ; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm9, %ymm12
6411 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm12, %ymm13
6412 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm13
6413 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm25, %xmm1
6414 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1,2,3],ymm13[4,5,6,7]
6415 ; AVX512DQ-FCP-NEXT: vmovdqa64 352(%rdi), %ymm18
6416 ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm9, %ymm1
6417 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
6418 ; AVX512DQ-FCP-NEXT: vmovdqa64 320(%rdi), %ymm19
6419 ; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm13
6420 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm9
6421 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm9
6422 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm22, %xmm15
6423 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm15[0,1,2,3],ymm9[4,5,6,7]
6424 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm9[0,1,2,3],zmm14[0,1,2,3]
6425 ; AVX512DQ-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
6426 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm10, %ymm10
6427 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
6428 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm10[6,7]
6429 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm4, %zmm10
6430 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm10, %xmm10
6431 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1,2,3],ymm3[4,5,6,7]
6432 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm8
6433 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6434 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm8[6,7]
6435 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm2, %zmm8
6436 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm8, %xmm8
6437 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm8[0,1,2,3],ymm0[4,5,6,7]
6438 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm0[0,1,2,3],zmm3[0,1,2,3]
6439 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm0
6440 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm3
6441 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5],ymm0[6,7]
6442 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm25, %zmm3
6443 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
6444 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7]
6445 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
6446 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
6447 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6448 ; AVX512DQ-FCP-NEXT: vpsrlq $16, %zmm22, %zmm3
6449 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
6450 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6451 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm0[0,1,2,3]
6452 ; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [1,3,2,3,1,3,5,7]
6453 ; AVX512DQ-FCP-NEXT: vpermd %ymm23, %ymm14, %ymm3
6454 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm0
6455 ; AVX512DQ-FCP-NEXT: vpermd %ymm24, %ymm14, %ymm8
6456 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm8, %ymm1
6457 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm0, %ymm7, %ymm1
6458 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm4, %zmm0
6459 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm0, %xmm0
6460 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7]
6461 ; AVX512DQ-FCP-NEXT: vpermd %ymm26, %ymm14, %ymm0
6462 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm0, %ymm12
6463 ; AVX512DQ-FCP-NEXT: vpermd %ymm27, %ymm14, %ymm11
6464 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm11, %ymm13
6465 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm12, %ymm7, %ymm13
6466 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm2, %zmm12
6467 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm12, %xmm12
6468 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3],ymm13[4,5,6,7]
6469 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm12[0,1,2,3],zmm1[0,1,2,3]
6470 ; AVX512DQ-FCP-NEXT: vpermd %ymm28, %ymm14, %ymm12
6471 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm12, %ymm1
6472 ; AVX512DQ-FCP-NEXT: vpermd %ymm17, %ymm14, %ymm13
6473 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm13, %ymm10
6474 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm1, %ymm7, %ymm10
6475 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm25, %zmm1
6476 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm1, %xmm1
6477 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm10[4,5,6,7]
6478 ; AVX512DQ-FCP-NEXT: vpermd %ymm18, %ymm14, %ymm1
6479 ; AVX512DQ-FCP-NEXT: vpshufb %ymm5, %ymm1, %ymm15
6480 ; AVX512DQ-FCP-NEXT: vpermd %ymm19, %ymm14, %ymm5
6481 ; AVX512DQ-FCP-NEXT: vpshufb %ymm6, %ymm5, %ymm6
6482 ; AVX512DQ-FCP-NEXT: vpermt2d %ymm15, %ymm7, %ymm6
6483 ; AVX512DQ-FCP-NEXT: vpsrlq $32, %zmm22, %zmm7
6484 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm7, %xmm7
6485 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7]
6486 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm6 = zmm6[0,1,2,3],zmm10[0,1,2,3]
6487 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm3, %ymm3
6488 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm7
6489 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
6490 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm4, %zmm4
6491 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm4, %xmm4
6492 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
6493 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm0
6494 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm11, %ymm4
6495 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2,3,4,5],ymm0[6,7]
6496 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm2, %zmm2
6497 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm2, %xmm2
6498 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
6499 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3]
6500 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm12, %ymm2
6501 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm13, %ymm3
6502 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm2[6,7]
6503 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm25, %zmm3
6504 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
6505 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
6506 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm1
6507 ; AVX512DQ-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm3
6508 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm1[6,7]
6509 ; AVX512DQ-FCP-NEXT: vpsrlq $48, %zmm22, %zmm3
6510 ; AVX512DQ-FCP-NEXT: vpmovqw %zmm3, %xmm3
6511 ; AVX512DQ-FCP-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
6512 ; AVX512DQ-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[0,1,2,3],zmm2[0,1,2,3]
6513 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, 64(%rsi)
6514 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, (%rsi)
6515 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 64(%rdx)
6516 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, (%rdx)
6517 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, 64(%rcx)
6518 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, (%rcx)
6519 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 64(%r8)
6520 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
6521 ; AVX512DQ-FCP-NEXT: vzeroupper
6522 ; AVX512DQ-FCP-NEXT: retq
6524 ; AVX512BW-LABEL: load_i16_stride4_vf64:
6525 ; AVX512BW: # %bb.0:
6526 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
6527 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
6528 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
6529 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
6530 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm4
6531 ; AVX512BW-NEXT: vmovdqa64 256(%rdi), %zmm5
6532 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm6
6533 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm7
6534 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
6535 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6536 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9
6537 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9
6538 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm10
6539 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm10
6540 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
6541 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm10
6542 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm8, %zmm10
6543 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
6544 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
6545 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
6546 ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
6547 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11
6548 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm11
6549 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12
6550 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm12
6551 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
6552 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm12
6553 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm10, %zmm12
6554 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
6555 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
6556 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
6557 ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
6558 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13
6559 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm13
6560 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm14
6561 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm14
6562 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
6563 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm14
6564 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm14
6565 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
6566 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
6567 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
6568 ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
6569 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm7
6570 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm5
6571 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
6572 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm14, %zmm2
6573 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm0
6574 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
6575 ; AVX512BW-NEXT: vmovdqa64 %zmm9, 64(%rsi)
6576 ; AVX512BW-NEXT: vmovdqa64 %zmm8, (%rsi)
6577 ; AVX512BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
6578 ; AVX512BW-NEXT: vmovdqa64 %zmm10, (%rdx)
6579 ; AVX512BW-NEXT: vmovdqa64 %zmm13, 64(%rcx)
6580 ; AVX512BW-NEXT: vmovdqa64 %zmm12, (%rcx)
6581 ; AVX512BW-NEXT: vmovdqa64 %zmm4, 64(%r8)
6582 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%r8)
6583 ; AVX512BW-NEXT: vzeroupper
6584 ; AVX512BW-NEXT: retq
6586 ; AVX512BW-FCP-LABEL: load_i16_stride4_vf64:
6587 ; AVX512BW-FCP: # %bb.0:
6588 ; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
6589 ; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
6590 ; AVX512BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
6591 ; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
6592 ; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4
6593 ; AVX512BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
6594 ; AVX512BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6
6595 ; AVX512BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7
6596 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
6597 ; AVX512BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6598 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
6599 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9
6600 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
6601 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm10
6602 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
6603 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
6604 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm8, %zmm10
6605 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
6606 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
6607 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
6608 ; AVX512BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
6609 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
6610 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm11
6611 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
6612 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm12
6613 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
6614 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
6615 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm10, %zmm12
6616 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
6617 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
6618 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
6619 ; AVX512BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
6620 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
6621 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm13
6622 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
6623 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm14
6624 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
6625 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
6626 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm12, %zmm14
6627 ; AVX512BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
6628 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
6629 ; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
6630 ; AVX512BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
6631 ; AVX512BW-FCP-NEXT: vpermt2w %zmm6, %zmm14, %zmm7
6632 ; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm5
6633 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
6634 ; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2
6635 ; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm0
6636 ; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
6637 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi)
6638 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
6639 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx)
6640 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
6641 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
6642 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
6643 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
6644 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
6645 ; AVX512BW-FCP-NEXT: vzeroupper
6646 ; AVX512BW-FCP-NEXT: retq
6648 ; AVX512DQ-BW-LABEL: load_i16_stride4_vf64:
6649 ; AVX512DQ-BW: # %bb.0:
6650 ; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %zmm0
6651 ; AVX512DQ-BW-NEXT: vmovdqa64 64(%rdi), %zmm1
6652 ; AVX512DQ-BW-NEXT: vmovdqa64 128(%rdi), %zmm2
6653 ; AVX512DQ-BW-NEXT: vmovdqa64 192(%rdi), %zmm3
6654 ; AVX512DQ-BW-NEXT: vmovdqa64 320(%rdi), %zmm4
6655 ; AVX512DQ-BW-NEXT: vmovdqa64 256(%rdi), %zmm5
6656 ; AVX512DQ-BW-NEXT: vmovdqa64 448(%rdi), %zmm6
6657 ; AVX512DQ-BW-NEXT: vmovdqa64 384(%rdi), %zmm7
6658 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
6659 ; AVX512DQ-BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6660 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm9
6661 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9
6662 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm10
6663 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm10
6664 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
6665 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm10
6666 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm8, %zmm10
6667 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
6668 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
6669 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
6670 ; AVX512DQ-BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
6671 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm11
6672 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm10, %zmm11
6673 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm12
6674 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm12
6675 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
6676 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm12
6677 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm10, %zmm12
6678 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
6679 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
6680 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
6681 ; AVX512DQ-BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
6682 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, %zmm13
6683 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm13
6684 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, %zmm14
6685 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm12, %zmm14
6686 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
6687 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, %zmm14
6688 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm14
6689 ; AVX512DQ-BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
6690 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
6691 ; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
6692 ; AVX512DQ-BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
6693 ; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm14, %zmm7
6694 ; AVX512DQ-BW-NEXT: vpermt2w %zmm4, %zmm14, %zmm5
6695 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
6696 ; AVX512DQ-BW-NEXT: vpermt2w %zmm3, %zmm14, %zmm2
6697 ; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm0
6698 ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
6699 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 64(%rsi)
6700 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm8, (%rsi)
6701 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 64(%rdx)
6702 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, (%rdx)
6703 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm13, 64(%rcx)
6704 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm12, (%rcx)
6705 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 64(%r8)
6706 ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%r8)
6707 ; AVX512DQ-BW-NEXT: vzeroupper
6708 ; AVX512DQ-BW-NEXT: retq
6710 ; AVX512DQ-BW-FCP-LABEL: load_i16_stride4_vf64:
6711 ; AVX512DQ-BW-FCP: # %bb.0:
6712 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %zmm0
6713 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm1
6714 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 128(%rdi), %zmm2
6715 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %zmm3
6716 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm4
6717 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 256(%rdi), %zmm5
6718 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 448(%rdi), %zmm6
6719 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 384(%rdi), %zmm7
6720 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60,0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60]
6721 ; AVX512DQ-BW-FCP-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
6722 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm9
6723 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm8, %zmm9
6724 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm10
6725 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm8, %zmm10
6726 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm10[0,1,2,3],zmm9[4,5,6,7]
6727 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm10
6728 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm8, %zmm10
6729 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
6730 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],zmm10[4,5,6,7]
6731 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61,1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
6732 ; AVX512DQ-BW-FCP-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
6733 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm11
6734 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm10, %zmm11
6735 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm12
6736 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm10, %zmm12
6737 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm11 = zmm12[0,1,2,3],zmm11[4,5,6,7]
6738 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm12
6739 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm10, %zmm12
6740 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm10
6741 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm10 = zmm10[0,1,2,3],zmm12[4,5,6,7]
6742 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm12 = [2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62,2,6,10,14,18,22,26,30,34,38,42,46,50,54,58,62]
6743 ; AVX512DQ-BW-FCP-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3]
6744 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, %zmm13
6745 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm12, %zmm13
6746 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, %zmm14
6747 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm12, %zmm14
6748 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm14[0,1,2,3],zmm13[4,5,6,7]
6749 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, %zmm14
6750 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm12, %zmm14
6751 ; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm1, %zmm0, %zmm12
6752 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
6753 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63,3,7,11,15,19,23,27,31,35,39,43,47,51,55,59,63]
6754 ; AVX512DQ-BW-FCP-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
6755 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm6, %zmm14, %zmm7
6756 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm14, %zmm5
6757 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm5[0,1,2,3],zmm7[4,5,6,7]
6758 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2
6759 ; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm14, %zmm0
6760 ; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm2[4,5,6,7]
6761 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, 64(%rsi)
6762 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rsi)
6763 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm11, 64(%rdx)
6764 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, (%rdx)
6765 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 64(%rcx)
6766 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx)
6767 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 64(%r8)
6768 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%r8)
6769 ; AVX512DQ-BW-FCP-NEXT: vzeroupper
6770 ; AVX512DQ-BW-FCP-NEXT: retq
6771 %wide.vec = load <256 x i16>, ptr %in.vec, align 64
6772 %strided.vec0 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 64, i32 68, i32 72, i32 76, i32 80, i32 84, i32 88, i32 92, i32 96, i32 100, i32 104, i32 108, i32 112, i32 116, i32 120, i32 124, i32 128, i32 132, i32 136, i32 140, i32 144, i32 148, i32 152, i32 156, i32 160, i32 164, i32 168, i32 172, i32 176, i32 180, i32 184, i32 188, i32 192, i32 196, i32 200, i32 204, i32 208, i32 212, i32 216, i32 220, i32 224, i32 228, i32 232, i32 236, i32 240, i32 244, i32 248, i32 252>
6773 %strided.vec1 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <64 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 65, i32 69, i32 73, i32 77, i32 81, i32 85, i32 89, i32 93, i32 97, i32 101, i32 105, i32 109, i32 113, i32 117, i32 121, i32 125, i32 129, i32 133, i32 137, i32 141, i32 145, i32 149, i32 153, i32 157, i32 161, i32 165, i32 169, i32 173, i32 177, i32 181, i32 185, i32 189, i32 193, i32 197, i32 201, i32 205, i32 209, i32 213, i32 217, i32 221, i32 225, i32 229, i32 233, i32 237, i32 241, i32 245, i32 249, i32 253>
6774 %strided.vec2 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <64 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 66, i32 70, i32 74, i32 78, i32 82, i32 86, i32 90, i32 94, i32 98, i32 102, i32 106, i32 110, i32 114, i32 118, i32 122, i32 126, i32 130, i32 134, i32 138, i32 142, i32 146, i32 150, i32 154, i32 158, i32 162, i32 166, i32 170, i32 174, i32 178, i32 182, i32 186, i32 190, i32 194, i32 198, i32 202, i32 206, i32 210, i32 214, i32 218, i32 222, i32 226, i32 230, i32 234, i32 238, i32 242, i32 246, i32 250, i32 254>
6775 %strided.vec3 = shufflevector <256 x i16> %wide.vec, <256 x i16> poison, <64 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63, i32 67, i32 71, i32 75, i32 79, i32 83, i32 87, i32 91, i32 95, i32 99, i32 103, i32 107, i32 111, i32 115, i32 119, i32 123, i32 127, i32 131, i32 135, i32 139, i32 143, i32 147, i32 151, i32 155, i32 159, i32 163, i32 167, i32 171, i32 175, i32 179, i32 183, i32 187, i32 191, i32 195, i32 199, i32 203, i32 207, i32 211, i32 215, i32 219, i32 223, i32 227, i32 231, i32 235, i32 239, i32 243, i32 247, i32 251, i32 255>
6776 store <64 x i16> %strided.vec0, ptr %out.vec0, align 64
6777 store <64 x i16> %strided.vec1, ptr %out.vec1, align 64
6778 store <64 x i16> %strided.vec2, ptr %out.vec2, align 64
6779 store <64 x i16> %strided.vec3, ptr %out.vec3, align 64